Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 0/4] caching bundles, iteration 4
From: Timo Teras @ 2010-04-05  7:00 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Changes since last iteration:
  - wrapped flow_cache_ops* in struct flow_cache_object for
    readability, Herbert's request
  - constified flow_cache_ops, noticed by Eric Dumazet
  - NETDEV_DOWN hook now calls garbage collect function to also process
    per-socket bundles (instead of the plain flow_cache_flush)
  - some coding style fixes

Timo Teras (4):
  flow: virtualize flow cache entry methods
  xfrm: cache bundles instead of policies for outgoing flows
  xfrm: remove policy garbage collection
  flow: delayed deletion of flow cache entries

 include/net/flow.h      |   23 ++-
 include/net/xfrm.h      |   12 +-
 net/core/flow.c         |  201 +++++++-----
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  818 +++++++++++++++++++++++++----------------------
 6 files changed, 584 insertions(+), 523 deletions(-)


^ permalink raw reply

* [RFC PATCH 2/2] netdev: an usage example on igb
From: Koki Sanagi @ 2010-04-05  6:54 UTC (permalink / raw)
  To: netdev
  Cc: izumi.taku, kaneshige.kenji, davem, nhorman, jeffrey.t.kirsher,
	jesse.brandeburg, bruce.w.allan, alexander.h.duyck,
	peter.p.waskiewicz.jr, john.ronciak
In-Reply-To: <4BB98828.5030302@jp.fujitsu.com>

This patch is a usage example of the previous patch's buffer on igb.
The output looks like this:

# cat /sys/kernel/debug/ndrvbuf/igb-trace-0000\:03\:00.0/buffer
[  1] 50462.369207: clean_tx qidx=1 ntu=154->156
[  0] 50462.369241: clean_rx qidx=0 ntu=111->112
[  0] 50462.369250: xmit qidx=1 ntu=156->158
[  1] 50462.369256: clean_tx qidx=1 ntu=156->158
[  1] 50462.369342: clean_rx qidx=0 ntu=113->114
[  1] 50462.369439: clean_rx qidx=0 ntu=114->115

This example prints in its own format, because it passes its own print
function (igb_trace_read) when registering the buffer:

register_ndrvbuf(bufname, 1000000, igb_trace_read);

If you pass NULL as arg3, output uses the ndrvbuf default style.
If you pass 0 as the size (arg2), recording is initially disabled (but a small
buffer is still allocated).
When the size is set to a non-zero value, recording becomes enabled.

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
  drivers/net/igb/Makefile    |    2 +-
  drivers/net/igb/igb.h       |    1 +
  drivers/net/igb/igb_main.c  |   10 +++++-
  drivers/net/igb/igb_trace.c |   81 +++++++++++++++++++++++++++++++++++++++++++
  drivers/net/igb/igb_trace.h |   21 +++++++++++
  5 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/drivers/net/igb/Makefile b/drivers/net/igb/Makefile
index 8372cb9..286541e 100644
--- a/drivers/net/igb/Makefile
+++ b/drivers/net/igb/Makefile
@@ -33,5 +33,5 @@
  obj-$(CONFIG_IGB) += igb.o
  
  igb-objs := igb_main.o igb_ethtool.o e1000_82575.o \
-	    e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o
+	    e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o igb_trace.o
  
diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index a177570..533c5e6 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -315,6 +315,7 @@ struct igb_adapter {
  	unsigned int vfs_allocated_count;
  	struct vf_data_storage *vf_data;
  	u32 rss_queues;
+	struct ndrvbuf *trace;
  };
  
  #define IGB_FLAG_HAS_MSI           (1 << 0)
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 583a21c..f754fd1 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -48,6 +48,7 @@
  #include <linux/dca.h>
  #endif
  #include "igb.h"
+#include "igb_trace.h"
  
  #define DRV_VERSION "2.1.0-k2"
  char igb_driver_name[] = "igb";
@@ -1412,6 +1413,7 @@ static int __devinit igb_probe(struct pci_dev *pdev,
  	int err, pci_using_dac;
  	u16 eeprom_apme_mask = IGB_EEPROM_APME;
  	u32 part_num;
+	char bufname[NDRVBUF_NAME_SIZE];
  
  	err = pci_enable_device_mem(pdev);
  	if (err)
@@ -1674,6 +1676,8 @@ static int __devinit igb_probe(struct pci_dev *pdev,
  		(adapter->flags & IGB_FLAG_HAS_MSI) ? "MSI" : "legacy",
  		adapter->num_rx_queues, adapter->num_tx_queues);
  
+	sprintf(bufname, "igb-trace-%s", pci_name(pdev));
+	adapter->trace = register_ndrvbuf(bufname, 1000000, igb_trace_read);
  	return 0;
  
  err_register:
@@ -1734,6 +1738,7 @@ static void __devexit igb_remove(struct pci_dev *pdev)
  	 * would have already happened in close and is redundant. */
  	igb_release_hw_control(adapter);
  
+	unregister_ndrvbuf(adapter->trace);
  	unregister_netdev(netdev);
  
  	igb_clear_interrupt_scheme(adapter);
@@ -3814,6 +3819,7 @@ netdev_tx_t igb_xmit_frame_ring_adv(struct sk_buff *skb,
  	}
  
  	igb_tx_queue_adv(tx_ring, tx_flags, count, skb->len, hdr_len);
+	igb_trace_write_xmit(adapter->trace, tx_ring, first);
  
  	/* Make sure there is space in the ring for the next send. */
  	igb_maybe_stop_tx(tx_ring, MAX_SKB_FRAGS + 4);
@@ -5039,7 +5045,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
  		eop = tx_ring->buffer_info[i].next_to_watch;
  		eop_desc = E1000_TX_DESC_ADV(*tx_ring, eop);
  	}
-
+	igb_trace_write_clean_tx(adapter->trace, tx_ring, i);
  	tx_ring->next_to_clean = i;
  
  	if (unlikely(count &&
@@ -5195,6 +5201,7 @@ static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
  {
  	struct igb_ring *rx_ring = q_vector->rx_ring;
  	struct net_device *netdev = rx_ring->netdev;
+	struct igb_adapter *adapter = netdev_priv(netdev);
  	struct pci_dev *pdev = rx_ring->pdev;
  	union e1000_adv_rx_desc *rx_desc , *next_rxd;
  	struct igb_buffer *buffer_info , *next_buffer;
@@ -5309,6 +5316,7 @@ next_desc:
  		staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
  	}
  
+	igb_trace_write_clean_rx(adapter->trace, rx_ring, i);
  	rx_ring->next_to_clean = i;
  	cleaned_count = igb_desc_unused(rx_ring);
  
diff --git a/drivers/net/igb/igb_trace.c b/drivers/net/igb/igb_trace.c
new file mode 100644
index 0000000..ab96300
--- /dev/null
+++ b/drivers/net/igb/igb_trace.c
@@ -0,0 +1,81 @@
+#include "igb_trace.h"
+
+struct trace_data_common {
+	unsigned short type;
+	u8 qidx;
+	int from;
+	int to;
+};
+
+void igb_trace_write_xmit(struct ndrvbuf *ndrvbuf,
+				struct igb_ring *tx_ring, int i)
+{
+	struct trace_data_common tdata;
+
+	tdata.type = IGB_TRACE_EVENT_XMIT;
+	tdata.qidx = tx_ring->queue_index;
+	tdata.from = i;
+	tdata.to = tx_ring->next_to_use;
+
+	write_ndrvbuf(ndrvbuf, sizeof(struct trace_data_common), &tdata);
+}
+
+void igb_trace_write_clean_tx(struct ndrvbuf *ndrvbuf,
+				struct igb_ring *tx_ring, int i)
+{
+	struct trace_data_common tdata;
+
+	tdata.type = IGB_TRACE_EVENT_CLEAN_TX;
+	tdata.qidx = tx_ring->queue_index;
+	tdata.from = tx_ring->next_to_clean;
+	tdata.to = i;
+
+	write_ndrvbuf(ndrvbuf, sizeof(struct trace_data_common), &tdata);
+}
+
+void igb_trace_write_clean_rx(struct ndrvbuf *ndrvbuf,
+				struct igb_ring *rx_ring, int i)
+{
+	struct trace_data_common tdata;
+
+	tdata.type = IGB_TRACE_EVENT_CLEAN_RX;
+	tdata.qidx = rx_ring->queue_index;
+	tdata.from = rx_ring->next_to_clean;
+	tdata.to = i;
+
+	write_ndrvbuf(ndrvbuf, sizeof(struct trace_data_common), &tdata);
+}
+
+size_t igb_trace_read(void *ubuf, size_t bufsize, void *entry,
+			size_t size, int cpu, u64 ts)
+{
+	struct trace_data_common *tdata = entry;
+	int bufpos = 0;
+	size_t headlen;
+
+	headlen = typical_ndrvbuf_header(ubuf, bufsize, cpu, ts);
+	bufpos += headlen;
+
+	switch (tdata->type) {
+	case IGB_TRACE_EVENT_XMIT:
+		bufpos += snprintf(ubuf + bufpos, bufsize - bufpos,
+					"xmit qidx=%u ntu=%d->%d\n",
+					tdata->qidx, tdata->from, tdata->to);
+		break;
+	case IGB_TRACE_EVENT_CLEAN_TX:
+		bufpos += snprintf(ubuf + bufpos, bufsize - bufpos,
+					"clean_tx qidx=%u ntu=%d->%d\n",
+					tdata->qidx, tdata->from, tdata->to);
+		break;
+	case IGB_TRACE_EVENT_CLEAN_RX:
+		bufpos += snprintf(ubuf + bufpos, bufsize - bufpos,
+					"clean_rx qidx=%u ntu=%d->%d\n",
+					tdata->qidx, tdata->from, tdata->to);
+		break;
+	default:
+		bufpos += snprintf(ubuf + bufpos, bufsize - bufpos,
+					"EVENT ID:%u is not defined\n",
+					tdata->type);
+	}
+	return bufpos;
+}
diff --git a/drivers/net/igb/igb_trace.h b/drivers/net/igb/igb_trace.h
new file mode 100644
index 0000000..fa130e1
--- /dev/null
+++ b/drivers/net/igb/igb_trace.h
@@ -0,0 +1,21 @@
+#ifndef _IGB_TRACE_H_
+#define _IGB_TRACE_H_
+
+#include <linux/ndrvbuf.h>
+#include "igb.h"
+
+#define IGB_TRACE_EVENT_XMIT		0x01
+#define IGB_TRACE_EVENT_CLEAN_TX	0x02
+#define IGB_TRACE_EVENT_CLEAN_RX	0x03
+
+extern void igb_trace_write_xmit(struct ndrvbuf *ndrvbuf,
+			struct igb_ring *tx_ring, int i);
+extern void igb_trace_write_clean_tx(struct ndrvbuf *ndrvbuf,
+			struct igb_ring *tx_ring, int i);
+extern void igb_trace_write_clean_rx(struct ndrvbuf *ndrvbuf,
+			struct igb_ring *rx_ring, int i);
+
+extern size_t igb_trace_read(void *ubuf, size_t bufsize, void *entry,
+			size_t size, int cpu, u64 ts);
+
+#endif /*_IGB_TRACE_H_*/


^ permalink raw reply related

* [RFC PATCH 1/2] netdev: buffer infrastructure to log network driver's information
From: Koki Sanagi @ 2010-04-05  6:52 UTC (permalink / raw)
  To: netdev
  Cc: izumi.taku, kaneshige.kenji, davem, nhorman, jeffrey.t.kirsher,
	jesse.brandeburg, bruce.w.allan, alexander.h.duyck,
	peter.p.waskiewicz.jr, john.ronciak
In-Reply-To: <4BB98828.5030302@jp.fujitsu.com>

This patch implements a buffer infrastructure under drivers/net.
The buffer records information from network drivers.

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
  drivers/net/Kconfig     |    8 +
  drivers/net/Makefile    |    1 +
  drivers/net/ndrvbuf.c   |  535 +++++++++++++++++++++++++++++++++++++++++++++++
  include/linux/ndrvbuf.h |   57 +++++
  4 files changed, 601 insertions(+), 0 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 7029cd5..98ac929 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -219,6 +219,14 @@ config MII
  	  or internal device.  It is safe to say Y or M here even if your
  	  ethernet card lack MII.
  
+config NDRVBUF
+	tristate "Use buffer for network device driver"
+	default m
+	help
+	  The ndrvbuf is a generally buffer for network driver. It can record
+	  some event or logging informaiton you want to preseve. ring_buffer
+	  is used as a buffer infrastructure.
+
  config MACB
  	tristate "Atmel MACB support"
  	depends on AVR32 || ARCH_AT91SAM9260 || ARCH_AT91SAM9263 || ARCH_AT91SAM9G20 || ARCH_AT91SAM9G45 || ARCH_AT91CAP9
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 4788862..84319ba 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -5,6 +5,7 @@
  obj-$(CONFIG_MII) += mii.o
  obj-$(CONFIG_MDIO) += mdio.o
  obj-$(CONFIG_PHYLIB) += phy/
+obj-$(CONFIG_NDRVBUF) += ndrvbuf.o
  
  obj-$(CONFIG_TI_DAVINCI_EMAC) += davinci_emac.o
  
diff --git a/drivers/net/ndrvbuf.c b/drivers/net/ndrvbuf.c
new file mode 100644
index 0000000..7401334
--- /dev/null
+++ b/drivers/net/ndrvbuf.c
@@ -0,0 +1,535 @@
+#include <linux/ndrvbuf.h>
+
+MODULE_LICENSE("GPL");
+static struct dentry *ndrvbuf_root;
+
+/* This list links all ndrvbuf registered */
+static LIST_HEAD(ndrvbuf_list);
+static DEFINE_MUTEX(ndrvbuf_list_lock);
+static atomic_t ndrvbuf_disabled;
+/* control reading ring_buffer */
+struct ndrvbuf_reader {
+	struct ndrvbuf *ndrvbuf;
+	loff_t idx;
+	struct ring_buffer_iter **iter;
+	char *print;
+	size_t print_len;
+	loff_t print_pos;
+};
+
+/**
+ * typical_ndrvbuf_header - write a typical header
+ * @dst: buffer to write
+ * @cpu: The processor number
+ * @ts: The time stamp
+ *
+ * Write a typical header to buffer
+ **/
+size_t _typical_ndrvbuf_header(void *dst, size_t size, int cpu, u64 ts)
+{
+	int ret;
+	unsigned long usecs_rem, secs;
+
+	ts += 500;
+	do_div(ts, NSEC_PER_USEC);
+	usecs_rem = do_div(ts, USEC_PER_SEC);
+	secs = (unsigned long)ts;
+	ret = snprintf(dst, size, "[%3d] %5lu.%06lu: ", cpu, secs, usecs_rem);
+	return ret;
+}
+EXPORT_SYMBOL(_typical_ndrvbuf_header);
+
+/**
+ * ndrvbuf_default_read - print the event using hex format
+ * @ubuf: The buffer to write
+ * @bufsize: The maximum byte to write
+ * @entry: The adrress to read
+ * @len: The size of etnry
+ * @cpu: The processor nubmer
+ * @ts: The time stamp
+ *
+ * If read==NULL in register_ndrvbuf, this funcion is used when printing.
+ **/
+static size_t ndrvbuf_default_read(void *ubuf, size_t bufsize, void *entry,
+				unsigned len, int cpu, u64 ts)
+{
+	int bufpos = 0, lpos = 0;
+
+	bufpos += _typical_ndrvbuf_header(ubuf, bufsize, cpu, ts);
+	bufpos += snprintf(ubuf + bufpos, bufsize - bufpos, "\n");
+	while (lpos < len) {
+		hex_dump_to_buffer(entry + lpos, len - lpos, 16, 4,
+				ubuf + bufpos, bufsize - bufpos, 0);
+		bufpos = strlen(ubuf);
+		bufpos += snprintf(ubuf + bufpos, bufsize - bufpos, "\n");
+		lpos += 16;
+	}
+	return bufpos;
+}
+
+static int is_in_ndrvbuf_list(struct ndrvbuf *ndrvbuf)
+{
+	struct list_head *cur;
+	struct ndrvbuf *cbuf;
+
+	list_for_each(cur, &ndrvbuf_list) {
+		cbuf = list_entry(cur, struct ndrvbuf, list);
+		if (cbuf == ndrvbuf)
+			break;
+	}
+	if (cur == &ndrvbuf_list)
+		return 0;
+	else
+		return 1;
+}
+
+/**
+ * ndrvbuf_buffer_open - open ring_buffer and set itearator to read
+ * @inode: contain ndrvbuf pointer
+ * @file: The pointer to attach the ndrvbuf pointer
+ **/
+static int ndrvbuf_buffer_open(struct inode *inode, struct file *file)
+{
+	struct ndrvbuf *ndrvbuf = inode->i_private;
+	struct ndrvbuf_reader *reader;
+	int cpu;
+
+	reader = kzalloc(sizeof(struct ndrvbuf_reader), GFP_KERNEL);
+	if (!reader) {
+		pr_warning("Could not alloc reader->iter\n");
+		goto out;
+	}
+	reader->iter = kmalloc(sizeof(struct ring_buffer_iter *) * nr_cpu_ids,
+				GFP_KERNEL);
+	if (!reader->iter) {
+		pr_warning("Could not alloc reader->iter\n");
+		goto out_free_reader;
+	}
+	mutex_lock(&ndrvbuf_list_lock);
+	if (!is_in_ndrvbuf_list(ndrvbuf)) {
+		mutex_unlock(&ndrvbuf_list_lock);
+		goto out_free_iter;
+	}
+	atomic_inc(&ndrvbuf->reader_count);
+	mutex_unlock(&ndrvbuf_list_lock);
+	for_each_online_cpu(cpu)
+		reader->iter[cpu] = ring_buffer_read_start(ndrvbuf->rbuf, cpu);
+	reader->print = kmalloc(NDRVBUF_STR_LEN, GFP_KERNEL);
+	if (!reader->print) {
+		pr_warning("Could not alloc reader->print\n");
+		goto out_free_iter;
+	}
+	reader->ndrvbuf = ndrvbuf;
+	file->private_data = reader;
+	return 0;
+
+out_free_iter:
+	kfree(reader->iter);
+out_free_reader:
+	kfree(reader);
+out:
+	return -ENOMEM;
+}
+
+static int ndrvbuf_buffer_release(struct inode *inode, struct file *file)
+{
+	struct ndrvbuf_reader *reader = file->private_data;
+	struct ndrvbuf *ndrvbuf = reader->ndrvbuf;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		ring_buffer_read_finish(reader->iter[cpu]);
+	}
+	kfree(reader->iter);
+	kfree(reader->print);
+	kfree(reader);
+	atomic_dec(&ndrvbuf->reader_count);
+	return 0;
+}
+
+static loff_t _ndrvbuf_lseek(struct ndrvbuf_reader *reader, loff_t pos)
+{
+	struct ndrvbuf *ndrvbuf = reader->ndrvbuf;
+	struct ring_buffer_iter **iter = reader->iter;
+	struct ring_buffer_event *event;
+	void *entry;
+	int cpu, next_cpu;
+	u64 ts, next_ts;
+	unsigned len;
+	int strlen;
+
+	reader->idx = 0;
+	for_each_online_cpu(cpu) {
+		ring_buffer_iter_reset(iter[cpu]);
+	}
+	while (1) {
+		next_ts = 0;
+		next_cpu = -1;
+		for_each_online_cpu(cpu) {
+			if (!iter[cpu])
+				continue;
+			event = ring_buffer_iter_peek(iter[cpu], &ts);
+			if (!event)
+				continue;
+			if (!next_ts || ts < next_ts) {
+				next_ts = ts;
+				next_cpu = cpu;
+			}
+		}
+		if (next_cpu < 0)
+			return -EINVAL;
+		event = ring_buffer_read(iter[next_cpu], &ts);
+		if (!event)
+			return -EINVAL;
+		entry = ring_buffer_event_data(event);
+		len = ring_buffer_event_length(event);
+		strlen = ndrvbuf->read(reader->print, NDRVBUF_STR_LEN,
+					entry, len, next_cpu, ts);
+		if (reader->idx + strlen > pos)
+			break;
+		reader->idx += strlen;
+	}
+	reader->print_len = strlen;
+	reader->print_pos = reader->idx + strlen - pos;
+	reader->idx = pos;
+	return pos;
+}
+
+/**
+ * ndrvbuf_buffer_read - read event and write it to user buffer
+ * @file: the file to read
+ * @buf: the file to write
+ * @nbytes: the maximum size to write
+ * @ppos: the position to read from
+ *
+ * Read an event from ring_buffer and write it user buffer.
+ * If read format function is set, use it.
+ **/
+static ssize_t ndrvbuf_buffer_read(struct file *file, char __user *buf,
+					size_t nbytes, loff_t *ppos)
+{
+	struct ndrvbuf_reader *reader = file->private_data;
+	struct ndrvbuf *ndrvbuf = reader->ndrvbuf;
+	struct ring_buffer_iter **iter = reader->iter;
+	struct ring_buffer_event *event;
+	void *entry;
+	unsigned len;
+	int cpu, next_cpu = -1;
+	u64 ts, next_ts = 0;
+	int ret;
+	size_t copy, strlen;
+	loff_t pos = *ppos;
+
+	if (pos != reader->idx)
+		_ndrvbuf_lseek(reader, pos);
+	if (reader->print_len) {
+		copy = min(reader->print_len, nbytes);
+		ret = copy_to_user(buf, reader->print + reader->print_pos,
+				copy);
+		copy -= ret;
+		reader->print_len -= copy;
+		reader->print_pos += copy;
+		reader->idx += copy;
+		*ppos += copy;
+		return copy;
+	}
+	for_each_online_cpu(cpu) {
+		if (!iter[cpu])
+			continue;
+		event = ring_buffer_iter_peek(iter[cpu], &ts);
+		if (!event)
+			continue;
+		if (!next_ts || ts < next_ts) {
+			next_ts = ts;
+			next_cpu = cpu;
+		}
+	}
+	if (next_cpu < 0)
+		return 0;
+	event = ring_buffer_read(iter[next_cpu], &ts);
+	if (!event)
+		return 0;
+	entry = ring_buffer_event_data(event);
+	len = ring_buffer_event_length(event);
+
+	strlen = ndrvbuf->read(reader->print, NDRVBUF_STR_LEN, entry,
+				len, next_cpu, ts);
+	reader->print_len = strlen;
+	copy = min(strlen, nbytes);
+	ret = copy_to_user(buf, reader->print, strlen);
+	copy -= ret;
+	reader->print_len -= copy;
+	reader->print_pos = copy;
+	*ppos = pos + copy;
+	reader->idx = *ppos;
+	return copy;
+}
+
+
+static const struct file_operations buffer_file_ops = {
+	.owner =	THIS_MODULE,
+	.open =		ndrvbuf_buffer_open,
+	.read =		ndrvbuf_buffer_read,
+	.release =	ndrvbuf_buffer_release,
+};
+
+static int ndrvbuf_buffer_size_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t ndrvbuf_buffer_size_read(struct file *file, char __user *ubuf,
+					size_t nbytes, loff_t *ppos)
+{
+	struct ndrvbuf *ndrvbuf = file->private_data;
+	int ret;
+	char buf[16];
+	ssize_t size;
+
+	mutex_lock(&ndrvbuf_list_lock);
+	if (!is_in_ndrvbuf_list(ndrvbuf)) {
+		mutex_unlock(&ndrvbuf_list_lock);
+		return -ENODEV;
+
+	}
+	ret = snprintf(buf, 16, "%lu\n", ndrvbuf->buffer_size);
+	size = simple_read_from_buffer(ubuf, nbytes, ppos, buf, ret);
+	mutex_unlock(&ndrvbuf_list_lock);
+	return size;
+}
+
+/**
+ * ndrvbuf_buffer_size_write - resize the size of ring_buffer
+ * @file: the file to read
+ * @buf: the file to write
+ * @nbytes: the maximum size to write
+ * @ppos: the position to read from
+ **/
+static ssize_t ndrvbuf_buffer_size_write(struct file *file,
+			const char __user *ubuf, size_t nbytes, loff_t *ppos)
+{
+	struct ndrvbuf *ndrvbuf = file->private_data;
+	unsigned long cur_size = ndrvbuf->buffer_size;
+	unsigned long new_size;
+	int ret;
+	char buf[64];
+
+	if (nbytes >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(&buf, ubuf, nbytes))
+		return -EFAULT;
+	buf[nbytes] = 0;
+	ret = strict_strtoul(buf, 10, &new_size);
+	if (ret < 0)
+		return ret;
+	if (cur_size == new_size)
+		return nbytes;
+schedule:
+	mutex_lock(&ndrvbuf_list_lock);
+	if (atomic_read(&ndrvbuf->reader_count) != 0) {
+		mutex_unlock(&ndrvbuf_list_lock);
+		schedule();
+		goto schedule;
+	}
+	atomic_inc(&ndrvbuf_disabled);
+	synchronize_sched();
+	if (!is_in_ndrvbuf_list(ndrvbuf)) {
+		atomic_dec(&ndrvbuf_disabled);
+		mutex_unlock(&ndrvbuf_list_lock);
+		return -ENODEV;
+	}
+	ret = ring_buffer_resize(ndrvbuf->rbuf, new_size);
+	if (ret < 0)
+		pr_warning("Could not change buffer size\n");
+	ndrvbuf->buffer_size = new_size;
+	if (new_size)
+		atomic_set(&ndrvbuf->disabled, 0);
+	else
+		atomic_set(&ndrvbuf->disabled, 1);
+	atomic_dec(&ndrvbuf_disabled);
+	mutex_unlock(&ndrvbuf_list_lock);
+
+	return nbytes;
+}
+
+static const struct file_operations buffer_size_file_ops = {
+	.owner =	THIS_MODULE,
+	.open =		ndrvbuf_buffer_size_open,
+	.read =		ndrvbuf_buffer_size_read,
+	.write =	ndrvbuf_buffer_size_write,
+};
+
+static struct ndrvbuf *create_ndrvbuf(const char *name, size_t size)
+{
+	struct ndrvbuf *ndrvbuf;
+	struct dentry *buf_dir;
+
+	ndrvbuf = kzalloc(sizeof(struct ndrvbuf), GFP_KERNEL);
+	if (!ndrvbuf)
+		goto out;
+	strcpy(ndrvbuf->name, name);
+	buf_dir = debugfs_create_dir(name, ndrvbuf_root);
+	if (!buf_dir) {
+		pr_warning("Could not create debugfs dir '%s'\n", name);
+		goto out_free_ndrvbuf;
+	}
+	ndrvbuf->buf_dir = buf_dir;
+	ndrvbuf->buffer_dent = debugfs_create_file("buffer",
+				S_IFREG|S_IRUGO|S_IWUSR,
+				buf_dir, ndrvbuf, &buffer_file_ops);
+	if (!ndrvbuf->buffer_dent) {
+		pr_warning("Could not create debugfs file 'buffer'\n");
+		goto out_rem_buf_dir;
+	}
+	ndrvbuf->buffer_size_dent = debugfs_create_file("buffer_size",
+				S_IFREG|S_IRUGO|S_IWUSR,
+				buf_dir, ndrvbuf, &buffer_size_file_ops);
+	if (!ndrvbuf->buffer_size_dent) {
+		pr_warning("Could not create debugfs file 'buffer_size'\n");
+		goto out_rem_buffer;
+	}
+	ndrvbuf->rbuf = ring_buffer_alloc(size, RB_FL_OVERWRITE);
+	if (!ndrvbuf->rbuf) {
+		pr_warning("Could not alloc ring_buffer for %s\n",
+							name);
+		goto out_rem_buffer_size;
+	}
+	ndrvbuf->buffer_size = size;
+	if (size)
+		atomic_set(&ndrvbuf->disabled, 0);
+	else
+		atomic_set(&ndrvbuf->disabled, 1);
+	return ndrvbuf;
+
+out_rem_buffer_size:
+	debugfs_remove(ndrvbuf->buffer_size_dent);
+out_rem_buffer:
+	debugfs_remove(ndrvbuf->buffer_dent);
+out_rem_buf_dir:
+	debugfs_remove(ndrvbuf->buf_dir);
+out_free_ndrvbuf:
+	kfree(ndrvbuf);
+out:
+	return NULL;
+}
+
+static void remove_ndrvbuf(struct ndrvbuf *ndrvbuf)
+{
+	if (!ndrvbuf)
+		return;
+	debugfs_remove(ndrvbuf->buffer_size_dent);
+	debugfs_remove(ndrvbuf->buffer_dent);
+	debugfs_remove(ndrvbuf->buf_dir);
+	ring_buffer_free(ndrvbuf->rbuf);
+	kfree(ndrvbuf);
+}
+
+/**
+ * register_ndrvbuf - create ndrvbuf struct
+ * @name: The buffer dif name. Usually, it is created under
+ *        /sys/kernel/debug/ndrvbuf
+ * @size: The size of ring_buffer per cpu
+ * @read: The read format funcion. If NULL, use ndrvbuf_default_read.
+ *
+ * This is called when network driver want to set ndrvbuf.
+ * If registering is failed, return NULL.
+ **/
+struct ndrvbuf *_register_ndrvbuf(const char *name, size_t size,
+		size_t (*read)(void *, size_t, void *, size_t, int, u64))
+{
+	struct ndrvbuf *cbuf;
+	struct list_head *cur;
+
+	mutex_lock(&ndrvbuf_list_lock);
+	atomic_inc(&ndrvbuf_disabled);
+	synchronize_sched();
+	list_for_each(cur, &ndrvbuf_list) {
+		cbuf = list_entry(cur, struct ndrvbuf, list);
+		if (!strncmp(cbuf->name, name, NDRVBUF_NAME_SIZE))
+			break;
+	}
+	if (cur != &ndrvbuf_list) {
+		pr_warning("%s already exists\n", name);
+		cbuf = NULL;
+		goto out;
+	}
+	cbuf = create_ndrvbuf(name, size);
+	if (!cbuf)
+		goto out;
+
+	if (read)
+		cbuf->read = read;
+	else
+		cbuf->read = ndrvbuf_default_read;
+	list_add(&cbuf->list, &ndrvbuf_list);
+out:
+	atomic_dec(&ndrvbuf_disabled);
+	mutex_unlock(&ndrvbuf_list_lock);
+	return cbuf;
+}
+EXPORT_SYMBOL(_register_ndrvbuf);
+
+/**
+ * unregister_ndrvbuf - free ndrbuf struct and some resources
+ * @ndrvbuf: The pointer of ndrvbuf
+ **/
+void _unregister_ndrvbuf(struct ndrvbuf *ndrvbuf)
+{
+schedule:
+	mutex_lock(&ndrvbuf_list_lock);
+	if (atomic_read(&ndrvbuf->reader_count) != 0) {
+		mutex_unlock(&ndrvbuf_list_lock);
+		schedule();
+		goto schedule;
+	}
+	atomic_inc(&ndrvbuf_disabled);
+	synchronize_sched();
+	list_del(&ndrvbuf->list);
+	atomic_dec(&ndrvbuf_disabled);
+	mutex_unlock(&ndrvbuf_list_lock);
+
+	remove_ndrvbuf(ndrvbuf);
+}
+EXPORT_SYMBOL(_unregister_ndrvbuf);
+
+/**
+ * _write_ndrvbuf - Write a trace event to buffer
+ * @ndrvbuf: buffer to write
+ * @size: The size of trace event
+ * @buf: The pointer to read from
+ **/
+void _write_ndrvbuf(struct ndrvbuf *ndrvbuf, size_t size, void *buf)
+{
+	unsigned long flags;
+
+	preempt_disable();
+	local_irq_save(flags);
+	if (!atomic_read(&ndrvbuf_disabled) && is_in_ndrvbuf_list(ndrvbuf)
+		&& !atomic_read(&ndrvbuf->disabled))
+		ring_buffer_write(ndrvbuf->rbuf, size, buf);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_ndrvbuf);
+
+static int __init ndrvbuf_init(void)
+{
+	ndrvbuf_root = debugfs_create_dir("ndrvbuf", NULL);
+	if (!ndrvbuf_root) {
+		pr_warning("Could not create debugfs dir 'ndrvbuf'\n");
+		return -ENODEV;
+	}
+	atomic_set(&ndrvbuf_disabled, 0);
+	return 0;
+}
+
+module_init(ndrvbuf_init);
+
+static void __exit ndrvbuf_exit(void)
+{
+	if (ndrvbuf_root)
+		debugfs_remove(ndrvbuf_root);
+}
+
+module_exit(ndrvbuf_exit);
diff --git a/include/linux/ndrvbuf.h b/include/linux/ndrvbuf.h
new file mode 100644
index 0000000..1d56b79
--- /dev/null
+++ b/include/linux/ndrvbuf.h
@@ -0,0 +1,57 @@
+#ifndef _NDRVBUF_H_
+#define _NDRVBUF_H_
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/ring_buffer.h>
+#include <linux/debugfs.h>
+#include <linux/netdevice.h>
+
+#define NDRVBUF_STR_LEN (2*PAGE_SIZE)
+#define NDRVBUF_NAME_SIZE 32
+
+struct ndrvbuf {
+	struct list_head list;
+	char name[NDRVBUF_NAME_SIZE];
+	struct dentry *buf_dir;
+	struct dentry *enable_dent;
+	struct dentry *buffer_size_dent;
+	struct dentry *buffer_dent;
+	unsigned long buffer_size;
+	atomic_t disabled;
+	size_t (*read)(void *ubuf, size_t bufsize, void *entry,
+			size_t size, int cpu, u64 ts);
+	struct ring_buffer *rbuf;
+	atomic_t reader_count;
+};
+
+extern __attribute__((weak)) struct ndrvbuf *_register_ndrvbuf(
+		const char *name, size_t size,
+		size_t (*read)(void *, size_t, void *, size_t, int, u64));
+extern __attribute__((weak)) void _unregister_ndrvbuf(struct ndrvbuf *ndrvbuf);
+extern __attribute__((weak)) void _write_ndrvbuf(struct ndrvbuf *ndrvbuf,
+		size_t size, void *buf);
+extern __attribute__((weak)) size_t _typical_ndrvbuf_header(void *dst,
+		size_t size, int cpu, u64 ts);
+
+#define register_ndrvbuf(name, size, read) ((_register_ndrvbuf) ? \
+	_register_ndrvbuf(name, size, read) : NULL)
+
+#define unregister_ndrvbuf(ndrvbuf)					\
+	do {								\
+		if (_unregister_ndrvbuf)				\
+			_unregister_ndrvbuf(ndrvbuf);			\
+	} while (0)
+
+#define write_ndrvbuf(ndrvbuf, size, buf)				\
+	do {								\
+		if (_write_ndrvbuf)					\
+			_write_ndrvbuf(ndrvbuf, size, buf);		\
+	} while (0)
+
+#define typical_ndrvbuf_header(dst, size, cpu, ts) (			\
+	(_typical_ndrvbuf_header) ? 					\
+	_typical_ndrvbuf_header(dst, size, cpu, ts) : 0)
+
+#endif /* _NDRVBUF_H_ */


^ permalink raw reply related

* [RFC PATCH 0/2] netdev: implement a buffer to log network driver's information
From: Koki Sanagi @ 2010-04-05  6:50 UTC (permalink / raw)
  To: netdev
  Cc: izumi.taku, kaneshige.kenji, davem, nhorman, jeffrey.t.kirsher,
	jesse.brandeburg, bruce.w.allan, alexander.h.duyck,
	peter.p.waskiewicz.jr, john.ronciak

This patch implements a buffer for recording network drivers' messages.
It extends the patch below so that other network drivers can use it.

http://marc.info/?l=e1000-devel&m=126690500618157&w=2

When I investigate network driver problems, I often want more detailed
debug information: for example, all device register information
(like ethtool -d) at the time a device reset happened, or the movement of the
tx/rx rings when the network stream is not smooth.
Currently such information can be recorded with syslog or ftrace,
but both have weak points.
Syslog is not appropriate when the messages are large (e.g. recording
all register information) or numerous (e.g. tracing internal movement).
Ftrace is appropriate for recording such messages, but it has only one
buffer for all ftrace events in the kernel. As a result, one adapter's events
may be flushed out by another's (of course, syslog has the same weak point).

This patch implements a buffer system which overcomes those weak points.

Its features are:

1. Each interface can hold its own buffer.
     This prevents recorded data from being flushed out by another
     interface's recorded data.

2. An interface can hold multiple buffers.
     This lets one adapter have several buffers, each for a different purpose.
     For example, one to trace the driver's internal movement and another to
     log error messages and other relevant information.

3. Resize and on/off per buffer.
     If you use this patch's buffer and register it in the driver's probe
     function, all adapters that use that driver get a buffer of the same
     size. If you want to trace one adapter but not another, you can turn
     the other adapter's buffer off.

An example implementation for igb is in patch 2.

HOW TO USE:

To see how to use it from the driver side, see patch 2.
The user side is shown below.

# mount -t debugfs nodev /sys/kernel/debug
# ls /sys/kernel/debug/ndrvbuf
igb-trace-0000:03:00.0 igb-trace-0000:03:00.1
# ls /sys/kernel/debug/ndrvbuf/igb-trace-0000:03:00.0
buffer  buffer_size

"buffer" is output interface. If you set read_format function in
register_ndrvbuf, it is used. If not, the default read function is used,
which displays the recorded data in hex format.

# cat buffer
[  1] 50462.369207: clean_tx qidx=1 ntu=154->156
[  0] 50462.369241: clean_rx qidx=0 ntu=111->112
[  0] 50462.369250: xmit qidx=1 ntu=156->158
[  1] 50462.369256: clean_tx qidx=1 ntu=156->158
[  1] 50462.369342: clean_rx qidx=0 ntu=113->114
[  1] 50462.369439: clean_rx qidx=0 ntu=114->115

"buffer_size" is size of buffer per CPU. If you want to change that,
# echo 1000000 > buffer_size
# cat buffer_size
1000000

If you want to disable recording,
# echo 0 > buffer_size

Thanks,
Koki Sanagi.

^ permalink raw reply

* [PATCH] mac80211: Ensure initializing private mc_list in prepare_multicast().
From: YOSHIFUJI Hideaki @ 2010-04-05  3:59 UTC (permalink / raw)
  To: davem; +Cc: jpirko, yoshfuji, netdev

Fix kernel panic by NULL pointer dereference in the context of
ieee80211_ops->prepare_multicast().

This bug was introduced by commit 22bedad3c.. ("net: convert
multicast list to list_head").

Call __hw_addr_init() in ieee80211_alloc_hw() to initialize
list_head of private device multicast list, like we do in
bond_init().

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/mac80211/main.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 84ad249..0b82cd2 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -388,6 +388,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 	local->uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN;

 	INIT_LIST_HEAD(&local->interfaces);
+
+	__hw_addr_init(&local->mc_list);
+
 	mutex_init(&local->iflist_mtx);
 	mutex_init(&local->scan_mtx);

-- 
1.5.6.5

^ permalink raw reply related

* Re: [PATCH 1/3] IPv6: Generic TTL Security Mechanism (original version)
From: YOSHIFUJI Hideaki @ 2010-04-05  4:48 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: davem, Pekka Savola, Nick Hilliard, netdev, YOSHIFUJI Hideaki
In-Reply-To: <20100403232922.489187907@vyatta.com>

Hi,

(2010/04/04 8:21), Stephen Hemminger wrote:
> The original proposed code; the IPV6 and IPV4 socket options are seperate.
> With this method, the server does have to deal with both IPv4 and IPv6
> socket options and the client has to handle the different for each
> family.

I am for 1/3 (original), not for 2/3, 3/3.

Because we should allow users to set respective value
for IPv4 and IPv6, as we allow users to do so for TTL
and hoplimit itself.

--yoshfuji


^ permalink raw reply

* [PATCH] ARM: dmabounce: fix partial sync in dma_sync_single_* API
From: FUJITA Tomonori @ 2010-04-05  3:39 UTC (permalink / raw)
  To: linux; +Cc: linux-arm-kernel, netdev, davem, linux-kernel

I don't have ARM hardware that uses dmabounce, so I can't confirm the
problem, but it seems that dmabounce doesn't work for some drivers...

=
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Subject: [PATCH] ARM: dmabounce: fix partial sync in dma_sync_single_* API

Some network drivers do a partial sync with
dma_sync_single_for_{device|cpu}. The dma_addr argument might not be
the same as the one passed into the mapping API.

This adds some tricks to find_safe_buffer() for
dma_sync_single_for_{device|cpu}.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
---
 arch/arm/common/dmabounce.c |   31 +++++++++++++++++++++----------
 1 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index cc0a932..87eb160 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -163,7 +163,8 @@ alloc_safe_buffer(struct dmabounce_device_info *device_info, void *ptr,
 
 /* determine if a buffer is from our "safe" pool */
 static inline struct safe_buffer *
-find_safe_buffer(struct dmabounce_device_info *device_info, dma_addr_t safe_dma_addr)
+find_safe_buffer(struct dmabounce_device_info *device_info, dma_addr_t safe_dma_addr,
+		 int for_sync)
 {
 	struct safe_buffer *b, *rb = NULL;
 	unsigned long flags;
@@ -171,10 +172,17 @@ find_safe_buffer(struct dmabounce_device_info *device_info, dma_addr_t safe_dma_
 	read_lock_irqsave(&device_info->lock, flags);
 
 	list_for_each_entry(b, &device_info->safe_buffers, node)
-		if (b->safe_dma_addr == safe_dma_addr) {
-			rb = b;
-			break;
-		}
+		if (for_sync) {
+			if (b->safe_dma_addr <= safe_dma_addr &&
+			    safe_dma_addr < b->safe_dma_addr + b->size) {
+				rb = b;
+				break;
+			}
+		} else
+			if (b->safe_dma_addr == safe_dma_addr) {
+				rb = b;
+				break;
+			}
 
 	read_unlock_irqrestore(&device_info->lock, flags);
 	return rb;
@@ -205,7 +213,8 @@ free_safe_buffer(struct dmabounce_device_info *device_info, struct safe_buffer *
 /* ************************************************** */
 
 static struct safe_buffer *find_safe_buffer_dev(struct device *dev,
-		dma_addr_t dma_addr, const char *where)
+						dma_addr_t dma_addr, const char *where,
+						int for_sync)
 {
 	if (!dev || !dev->archdata.dmabounce)
 		return NULL;
@@ -216,7 +225,7 @@ static struct safe_buffer *find_safe_buffer_dev(struct device *dev,
 			pr_err("unknown device: Trying to %s invalid mapping\n", where);
 		return NULL;
 	}
-	return find_safe_buffer(dev->archdata.dmabounce, dma_addr);
+	return find_safe_buffer(dev->archdata.dmabounce, dma_addr, for_sync);
 }
 
 static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
@@ -286,7 +295,7 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 static inline void unmap_single(struct device *dev, dma_addr_t dma_addr,
 		size_t size, enum dma_data_direction dir)
 {
-	struct safe_buffer *buf = find_safe_buffer_dev(dev, dma_addr, "unmap");
+	struct safe_buffer *buf = find_safe_buffer_dev(dev, dma_addr, "unmap", 0);
 
 	if (buf) {
 		BUG_ON(buf->size != size);
@@ -398,7 +407,7 @@ int dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,
 	dev_dbg(dev, "%s(dma=%#x,off=%#lx,sz=%zx,dir=%x)\n",
 		__func__, addr, off, sz, dir);
 
-	buf = find_safe_buffer_dev(dev, addr, __func__);
+	buf = find_safe_buffer_dev(dev, addr, __func__, 1);
 	if (!buf)
 		return 1;
 
@@ -411,6 +420,8 @@ int dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,
 	DO_STATS(dev->archdata.dmabounce->bounce_count++);
 
 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+		if (addr != buf->safe_dma_addr)
+			off = addr - buf->safe_dma_addr;
 		dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n",
 			__func__, buf->safe + off, buf->ptr + off, sz);
 		memcpy(buf->ptr + off, buf->safe + off, sz);
@@ -427,7 +438,7 @@ int dmabounce_sync_for_device(struct device *dev, dma_addr_t addr,
 	dev_dbg(dev, "%s(dma=%#x,off=%#lx,sz=%zx,dir=%x)\n",
 		__func__, addr, off, sz, dir);
 
-	buf = find_safe_buffer_dev(dev, addr, __func__);
+	buf = find_safe_buffer_dev(dev, addr, __func__, 1);
 	if (!buf)
 		return 1;
 
-- 
1.7.0


^ permalink raw reply related

* [PATCH][iproute2] Detect 6rd kernel missing support / 6rd tunnel scope
From: Alexandre Cassen @ 2010-04-04 20:40 UTC (permalink / raw)
  To: netdev

This patch fixes two issues:

* If the kernel does not support 6rd, the ioctl() call
  will return EINVAL; in that case just skip the perror call.

* 6rd scope is ipv6/ip tunnels. Don't try to fetch
  6rd tunnel parms if tunnel protocol != IPPROTO_IPV6.

Signed-off-by: Alexandre Cassen <acassen@freebox.fr>
---
 ip/iptunnel.c |    2 +-
 ip/tunnel.c   |   11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index 1cd9fbd..3525fbb 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c
@@ -365,7 +365,7 @@ static void print_tunnel(struct ip_tunnel_parm *p)
 	if (!(p->iph.frag_off&htons(IP_DF)))
 		printf(" nopmtudisc");
 
-	if (!tnl_ioctl_get_6rd(p->name, &ip6rd) && ip6rd.prefixlen) {
+	if (p->iph.protocol == IPPROTO_IPV6 && !tnl_ioctl_get_6rd(p->name, &ip6rd) && ip6rd.prefixlen) {
 		printf(" 6rd-prefix %s/%u ",
 		       inet_ntop(AF_INET6, &ip6rd.prefix, s1, sizeof(s1)),
 		       ip6rd.prefixlen);
diff --git a/ip/tunnel.c b/ip/tunnel.c
index d389e86..6efbd2d 100644
--- a/ip/tunnel.c
+++ b/ip/tunnel.c
@@ -26,6 +26,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
+#include <errno.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
@@ -168,7 +169,7 @@ int tnl_del_ioctl(const char *basedev, const char *name, void *p)
 	return err;
 }
 
-static int tnl_gen_ioctl(int cmd, const char *name, void *p)
+static int tnl_gen_ioctl(int cmd, const char *name, void *p, int skiperr)
 {
 	struct ifreq ifr;
 	int fd;
@@ -178,7 +179,7 @@ static int tnl_gen_ioctl(int cmd, const char *name, void *p)
 	ifr.ifr_ifru.ifru_data = p;
 	fd = socket(preferred_family, SOCK_DGRAM, 0);
 	err = ioctl(fd, cmd, &ifr);
-	if (err)
+	if (err && errno != skiperr)
 		perror("ioctl");
 	close(fd);
 	return err;
@@ -186,15 +187,15 @@ static int tnl_gen_ioctl(int cmd, const char *name, void *p)
 
 int tnl_prl_ioctl(int cmd, const char *name, void *p)
 {
-	return tnl_gen_ioctl(cmd, name, p);
+	return tnl_gen_ioctl(cmd, name, p, -1);
 }
 
 int tnl_6rd_ioctl(int cmd, const char *name, void *p)
 {
-	return tnl_gen_ioctl(cmd, name, p);
+	return tnl_gen_ioctl(cmd, name, p, -1);
 }
 
 int tnl_ioctl_get_6rd(const char *name, void *p)
 {
-	return tnl_gen_ioctl(SIOCGET6RD, name, p);
+	return tnl_gen_ioctl(SIOCGET6RD, name, p, EINVAL);
 }
-- 
1.6.3.3


^ permalink raw reply related

* Bug#575970: iproute2: silence errors about kernel missing 6rd on "ip tun show".
From: Alexandre Cassen @ 2010-04-04 20:23 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Andreas Henriksson, 575970, netdev
In-Reply-To: <20100331160642.75766629@s6510>

Hello,

On Wed, 31 Mar 2010, Stephen Hemminger wrote:
> On Wed, 31 Mar 2010 10:08:54 +0200
> Andreas Henriksson <andreas@fatal.se> wrote:
>
>> Hello!
>>
>> As reported in http://bugs.debian.org/575970 there is currently a warning
>> printed for every tunnel when using latest iproute2 on atleast <= 2.6.32
>> kernels (missing 6rd?!).
>>
>> The attached patch avoids perror when errno is EINVAL, which I assume
>> is the way to detect missing 6rd support. A better/cleaner
>> method to detect and avoid 6rd when there's no kernel support
>> is more then welcome.
>>
>> Regards,
>> Andreas Henriksson
>>
>
> I will wait (a little while) to see if Alexandre has a preferred alternative.

IMHO, the proper way to detect missing 6rd kernel support is to catch
EINVAL, but not other errno values, since they might be useful in some cases.

Reading the code again, there is also a need to test tunnel protocol since 
6rd scope is ipv6/ip only.

I will send another email with a proposed patch to fix those 2 issues.

Regs,
Alexandre

^ permalink raw reply

* Re: small packets sent through ne2k-pci delayed
From: Stephen Hemminger @ 2010-04-04 17:23 UTC (permalink / raw)
  To: Florian Zumbiehl; +Cc: p gortmaker, netdev
In-Reply-To: <27697562.59751270401801591.JavaMail.root@tahiti.vyatta.com>


----- "Florian Zumbiehl" <florz@florz.de> wrote:

> Hi,
> 
> I noticed today that a 2.6.33 kernel with an ne2k-pci card of mine
> transmits small packets (those that result in frames < 61 bytes)
> only with some major delay - or more exactly, it seems that they are
> being transmitted only when the next packet to be transmitted comes
> along.
> 
> Now, this patch seems to fix it for me, but I am not that sure that
> that's
> how it should be fixed:
> 
> di

> I _think_ that this problem did not exist with 2.6.22-rc4 - but I
> didn't
> have a chance to (re-)test that yet, I just think that I did things in
> the
> past (when that kernel was still running on that machine) that would
> imply
> that the problem did not exist at the time ...
> 
> So, any suggestions what I should try, other than re-testing with
> 2.6.22-rc4 (some time soon, I can do that, too, just not now)?
> 
> What might be interesting to know: those 61-byte frames do actually
> arrive
> at the recipient as 61-byte frames ...
> 
> Florian
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

The standard correct way to do this is skb_padto()

^ permalink raw reply

* patch to improve x.25 throughput negotiation
From: John Hughes @ 2010-04-04 16:48 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 518 bytes --]

The current X.25 code has some bugs in throughput negotiation:

   1. It does negotiation in all cases, usually there is no need
   2. It incorrectly attempts to negotiate the throughput class in one
      direction only.  There are separate throughput classes for input
      and output and if either is negotiated both must be negotiated.

This is bug https://bugzilla.kernel.org/show_bug.cgi?id=15681

This bug was first reported by Daniel Ferenci to the linux-x25 mailing 
list on 6/8/2004, but is still present.


[-- Attachment #2: throughput.patch --]
[-- Type: text/x-patch, Size: 3294 bytes --]

From: John Hughes <john@calva.com>
Subject: x.25 attempts to negotiate invalid throughput

The current (2.6.34) x.25 code doesn't seem to know that the X.25 throughput
facility includes two values, one for the required throughput outbound, one
for inbound.

This causes it to attempt to negotiate throughput 0x0A, which is throughput
9600 inbound and the illegal value "0" for outbound throughput.

Because of this some X.25 devices (e.g. Cisco 1600) refuse to connect to Linux
X.25.

The following patch fixes this behaviour.  Unless the user specifies a required
throughput it does not attempt to negotiate.  If the user does not specify
a throughput it accepts the suggestion of the remote X.25 system.  If the
user requests a throughput then it validates both the input and output
throughputs and correctly negotiates them with the remote end.

Signed-off-by: John Hughes <john@calva.com>

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 9796f3e..f391f61 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -553,7 +553,8 @@ static int x25_create(struct net *net, struct socket *sock, int protocol,
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.pacsize_in  = X25_DEFAULT_PACKET_SIZE;
 	x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE;
-	x25->facilities.throughput  = X25_DEFAULT_THROUGHPUT;
+	x25->facilities.throughput  = 0;	/* by default don't negotiate
+						   throughput */
 	x25->facilities.reverse     = X25_DEFAULT_REVERSE;
 	x25->dte_facilities.calling_len = 0;
 	x25->dte_facilities.called_len = 0;
@@ -1414,9 +1415,20 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			if (facilities.winsize_in < 1 ||
 			    facilities.winsize_in > 127)
 				break;
-			if (facilities.throughput < 0x03 ||
-			    facilities.throughput > 0xDD)
-				break;
+			if (facilities.throughput) {
+				int out = facilities.throughput & 0xf0;
+				int in  = facilities.throughput & 0x0f;
+				if (!out)
+					facilities.throughput |=
+						X25_DEFAULT_THROUGHPUT << 4;
+				else if (out < 0x30 || out > 0xD0)
+					break;
+				if (!in)
+					facilities.throughput |=
+						X25_DEFAULT_THROUGHPUT;
+				else if (in < 0x03 || in > 0x0D)
+					break;
+			}
 			if (facilities.reverse &&
 				(facilities.reverse & 0x81) != 0x81)
 				break;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21f664..b447a66 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -259,9 +259,18 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	new->reverse = theirs.reverse;
 
 	if (theirs.throughput) {
-		if (theirs.throughput < ours->throughput) {
-			SOCK_DEBUG(sk, "X.25: throughput negotiated down\n");
-			new->throughput = theirs.throughput;
+		int theirs_in =  theirs.throughput & 0x0f;
+		int theirs_out = theirs.throughput & 0xf0;
+		int ours_in  = ours->throughput & 0x0f;
+		int ours_out = ours->throughput & 0xf0;
+		if (!ours_in || theirs_in < ours_in) {
+			SOCK_DEBUG(sk, "X.25: inbound throughput negotiated\n");
+			new->throughput = (new->throughput & 0xf0) | theirs_in;
+		}
+		if (!ours_out || theirs_out < ours_out) {
+			SOCK_DEBUG(sk,
+				"X.25: outbound throughput negotiated\n");
+			new->throughput = (new->throughput & 0x0f) | theirs_out;
 		}
 	}
 

^ permalink raw reply related

* Patch to fix kernel bug 15678 - x25 code accesses fields beyond the end of packet.
From: John Hughes @ 2010-04-04 16:40 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 257 bytes --]

The current X.25 code attempts to decode fields in X.25 packets that are 
not present.  Here is a little patch that checks the received packet 
length before attempting to decode the missing fields.  It also improves 
error checking for malformed packets.


[-- Attachment #2: x25-overrun.patch --]
[-- Type: text/x-patch, Size: 4972 bytes --]

From: John Hughes <john@calva.com>
Subject: Patch to fix bug 15678 - x25 accesses fields beyond end of packet.

Here is a patch to stop X.25 examining fields beyond the end of the packet.

For example, when a simple CALL ACCEPTED was received:

	10 10 0f

x25_parse_facilities was attempting to decode the FACILITIES field, but this
packet contains no facilities field.

Signed-off-by: John Hughes <john@calva.com>

diff --git a/include/net/x25.h b/include/net/x25.h
index 9baa07d..33f67fb 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -182,6 +182,10 @@ extern int  sysctl_x25_clear_request_timeout;
 extern int  sysctl_x25_ack_holdback_timeout;
 extern int  sysctl_x25_forward;
 
+extern int x25_parse_address_block(struct sk_buff *skb,
+		struct x25_address *called_addr,
+		struct x25_address *calling_addr);
+
 extern int  x25_addr_ntoa(unsigned char *, struct x25_address *,
 			  struct x25_address *);
 extern int  x25_addr_aton(unsigned char *, struct x25_address *,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 9796f3e..fe26c01 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -82,6 +82,41 @@ struct compat_x25_subscrip_struct {
 };
 #endif
 
+
+int x25_parse_address_block(struct sk_buff *skb,
+		struct x25_address *called_addr,
+		struct x25_address *calling_addr)
+{
+	unsigned char len;
+	int needed;
+	int rc;
+
+	if (skb->len < 1) {
+		/* packet has no address block */
+		rc = 0;
+		goto empty;
+	}
+
+	len = *skb->data;
+	needed = 1 + (len >> 4) + (len & 0x0f);
+
+	if (skb->len < needed) {
+		/* packet is too short to hold the addresses it claims
+		   to hold */
+		rc = -1;
+		goto empty;
+	}
+
+	return x25_addr_ntoa(skb->data, called_addr, calling_addr);
+
+empty:
+	*called_addr->x25_addr = 0;
+	*calling_addr->x25_addr = 0;
+
+	return rc;
+}
+
+
 int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr,
 		  struct x25_address *calling_addr)
 {
@@ -921,16 +956,26 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	/*
 	 *	Extract the X.25 addresses and convert them to ASCII strings,
 	 *	and remove them.
+	 *
+	 *	Address block is mandatory in call request packets
 	 */
-	addr_len = x25_addr_ntoa(skb->data, &source_addr, &dest_addr);
+	addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr);
+	if (addr_len <= 0)
+		goto out_clear_request;
 	skb_pull(skb, addr_len);
 
 	/*
 	 *	Get the length of the facilities, skip past them for the moment
 	 *	get the call user data because this is needed to determine
 	 *	the correct listener
+	 *
+	 *	Facilities length is mandatory in call request packets
 	 */
+	if (skb->len < 1)
+		goto out_clear_request;
 	len = skb->data[0] + 1;
+	if (skb->len < len)
+		goto out_clear_request;
 	skb_pull(skb,len);
 
 	/*
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21f664..a2765c6 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -35,7 +35,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 		struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
 {
 	unsigned char *p = skb->data;
-	unsigned int len = *p++;
+	unsigned int len;
 
 	*vc_fac_mask = 0;
 
@@ -50,6 +50,14 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
 	memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae));
 	memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae));
 
+	if (skb->len < 1)
+		return 0;
+
+	len = *p++;
+
+	if (len >= skb->len)
+		return -1;
+
 	while (len > 0) {
 		switch (*p & X25_FAC_CLASS_MASK) {
 		case X25_FAC_CLASS_A:
@@ -247,6 +255,8 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	memcpy(new, ours, sizeof(*new));
 
 	len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask);
+	if (len < 0)
+		return len;
 
 	/*
 	 *	They want reverse charging, we won't accept it.
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index 96d9227..b39072f 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -89,6 +89,7 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
 static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype)
 {
 	struct x25_address source_addr, dest_addr;
+	int len;
 
 	switch (frametype) {
 		case X25_CALL_ACCEPTED: {
@@ -106,11 +107,17 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 			 *	Parse the data in the frame.
 			 */
 			skb_pull(skb, X25_STD_MIN_LEN);
-			skb_pull(skb, x25_addr_ntoa(skb->data, &source_addr, &dest_addr));
-			skb_pull(skb,
-				 x25_parse_facilities(skb, &x25->facilities,
+
+			len = x25_parse_address_block(skb, &source_addr,
+						&dest_addr);
+			if (len > 0)
+				skb_pull(skb, len);
+
+			len = x25_parse_facilities(skb, &x25->facilities,
 						&x25->dte_facilities,
-						&x25->vc_facil_mask));
+						&x25->vc_facil_mask);
+			if (len > 0)
+				skb_pull(skb, len);
 			/*
 			 *	Copy any Call User Data.
 			 */

^ permalink raw reply related

* [PATCH] 3c503: Fix IRQ probing
From: Ben Hutchings @ 2010-04-04 16:33 UTC (permalink / raw)
  To: Paul Gortmaker; +Cc: netdev, 566522, Piotr Skólski

The driver attempts to select an IRQ for the NIC automatically by
testing which of the supported IRQs are available and then probing
each available IRQ with probe_irq_{on,off}().  There are obvious race
conditions here, besides which:
1. The test for availability is done by passing a NULL handler, which
   now always returns -EINVAL, thus the device cannot be opened:
   <http://bugs.debian.org/566522>
2. probe_irq_off() will report only the first ISA IRQ handled,
   potentially leading to a false negative.

There was another bug that meant it ignored all error codes from
request_irq() except -EBUSY, so it would 'succeed' despite this
(possibly causing conflicts with other ISA devices).  This was fixed
by ab08999d6029bb2c79c16be5405d63d2bedbdfea 'WARNING: some
request_irq() failures ignored in el2_open()', which exposed bug 1.

This patch:
1. Replaces the use of probe_irq_{on,off}() with a real interrupt handler
2. Adds a delay before checking the interrupt-seen flag
3. Disables interrupts on all failure paths
4. Distinguishes error codes from the second request_irq() call,
   consistently with the first

Compile-tested only.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 drivers/net/3c503.c |   42 ++++++++++++++++++++++++++++++------------
 1 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/net/3c503.c b/drivers/net/3c503.c
index 66e0323..b74a0ea 100644
--- a/drivers/net/3c503.c
+++ b/drivers/net/3c503.c
@@ -380,6 +380,12 @@ out:
     return retval;
 }

+static irqreturn_t el2_probe_interrupt(int irq, void *seen)
+{
+	*(bool *)seen = true;
+	return IRQ_HANDLED;
+}
+
 static int
 el2_open(struct net_device *dev)
 {
@@ -391,23 +397,35 @@ el2_open(struct net_device *dev)

 	outb(EGACFR_NORM, E33G_GACFR);	/* Enable RAM and interrupts. */
 	do {
-	    retval = request_irq(*irqp, NULL, 0, "bogus", dev);
-	    if (retval >= 0) {
+		bool seen;
+
+		retval = request_irq(*irqp, el2_probe_interrupt, 0,
+				     dev->name, &seen);
+		if (retval == -EBUSY)
+			continue;
+		if (retval < 0)
+			goto err_disable;
+
 		/* Twinkle the interrupt, and check if it's seen. */
-		unsigned long cookie = probe_irq_on();
+		seen = false;
+		smp_wmb();
 		outb_p(0x04 << ((*irqp == 9) ? 2 : *irqp), E33G_IDCFR);
 		outb_p(0x00, E33G_IDCFR);
-		if (*irqp == probe_irq_off(cookie) &&	/* It's a good IRQ line! */
-		    ((retval = request_irq(dev->irq = *irqp,
-					   eip_interrupt, 0,
-					   dev->name, dev)) == 0))
-		    break;
-	    } else {
-		    if (retval != -EBUSY)
-			    return retval;
-	    }
+		msleep(1);
+		free_irq(*irqp, el2_probe_interrupt);
+		if (!seen)
+			continue;
+
+		retval = request_irq(dev->irq = *irqp, eip_interrupt, 0,
+				     dev->name, dev);
+		if (retval == -EBUSY)
+			continue;
+		if (retval < 0)
+			goto err_disable;
 	} while (*++irqp);
+
 	if (*irqp == 0) {
+	err_disable:
 	    outb(EGACFR_IRQOFF, E33G_GACFR);	/* disable interrupts. */
 	    return -EAGAIN;
 	}
-- 
1.7.0.3

^ permalink raw reply related

* [PATCH] myri10ge: fix rx_pause in myri10ge_set_pauseparam
From: Brice Goglin @ 2010-04-04 15:19 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Network Development list, Andrew J. Gallatin

Fix rx_pause in myri10ge_set_pauseparam().

Signed-off-by: Brice Goglin <brice@myri.com>

diff --git a/drivers/net/myri10ge/myri10ge.c
b/drivers/net/myri10ge/myri10ge.c
index e84dd3e..3cb7607 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1689,7 +1689,7 @@ myri10ge_set_pauseparam(struct net_device *netdev,
     if (pause->tx_pause != mgp->pause)
         return myri10ge_change_pause(mgp, pause->tx_pause);
     if (pause->rx_pause != mgp->pause)
-        return myri10ge_change_pause(mgp, pause->tx_pause);
+        return myri10ge_change_pause(mgp, pause->rx_pause);
     if (pause->autoneg != 0)
         return -EINVAL;
     return 0;


^ permalink raw reply related

* small packets sent through ne2k-pci delayed
From: Florian Zumbiehl @ 2010-04-04 14:18 UTC (permalink / raw)
  To: p_gortmaker, netdev

Hi,

I noticed today that a 2.6.33 kernel with an ne2k-pci card of mine
transmits small packets (those that result in frames < 61 bytes)
only with some major delay - or more exactly, it seems that they are
being transmitted only when the next packet to be transmitted comes along.

Now, this patch seems to fix it for me, but I am not that sure that that's
how it should be fixed:

diff --git a/drivers/net/lib8390.c b/drivers/net/lib8390.c
index 57f2584..ccef77c 100644
--- a/drivers/net/lib8390.c
+++ b/drivers/net/lib8390.c
@@ -306,13 +306,13 @@ static netdev_tx_t __ei_start_xmit(struct sk_buff *skb,
 	struct ei_device *ei_local = (struct ei_device *) netdev_priv(dev);
 	int send_length = skb->len, output_page;
 	unsigned long flags;
-	char buf[ETH_ZLEN];
+	char buf[ETH_ZLEN+1];
 	char *data = skb->data;

-	if (skb->len < ETH_ZLEN) {
-		memset(buf, 0, ETH_ZLEN);	/* more efficient than doing just the needed bits */
+	if (skb->len < ETH_ZLEN+1) {
+		memset(buf, 0, ETH_ZLEN+1);	/* more efficient than doing just the needed bits */
 		memcpy(buf, data, skb->len);
-		send_length = ETH_ZLEN;
+		send_length = ETH_ZLEN+1;
 		data = buf;
 	}

I _think_ that this problem did not exist with 2.6.22-rc4 - but I didn't
have a chance to (re-)test that yet, I just think that I did things in the
past (when that kernel was still running on that machine) that would imply
that the problem did not exist at the time ...

So, any suggestions what I should try, other than re-testing with
2.6.22-rc4 (some time soon, I can do that, too, just not now)?

What might be interesting to know: those 61-byte frames do actually arrive
at the recipient as 61-byte frames ...

Florian

^ permalink raw reply related

* [PATCHv2] virtio-net: move sg off stack
From: Michael S. Tsirkin @ 2010-04-04 13:07 UTC (permalink / raw)
  To: David S. Miller, Rusty Russell, Jiri Pirko, Michael S. Tsirkin,
	Shirley Ma, ne

Move sg structure off stack and into virtnet_info structure.
This helps remove extra sg_init_table calls as well as reduce
stack usage.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1: fix compilation of add_recvbuf_mergeable (&rx_sg -> rx_sg)

This patch works for me. Shirley, could you find the time to test
this as well please?

 drivers/net/virtio_net.c |   52 ++++++++++++++++++++++-----------------------
 1 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3f5be35..186dd6a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,8 +39,7 @@ module_param(gso, bool, 0444);
 
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
 
-struct virtnet_info
-{
+struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *rvq, *svq, *cvq;
 	struct net_device *dev;
@@ -61,6 +60,10 @@ struct virtnet_info
 
 	/* Chain pages by the private ptr. */
 	struct page *pages;
+
+	/* fragments + linear part + virtio header */
+	struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
+	struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -323,10 +326,8 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 {
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
-	struct scatterlist sg[2];
 	int err;
 
-	sg_init_table(sg, 2);
 	skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
 	if (unlikely(!skb))
 		return -ENOMEM;
@@ -334,11 +335,11 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 	skb_put(skb, MAX_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
-	sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
 
-	skb_to_sgvec(skb, sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
 
-	err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+	err = vi->rvq->vq_ops->add_buf(vi->rvq, vi->rx_sg, 0, 2, skb);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
@@ -347,13 +348,11 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 
 static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
 {
-	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 	struct page *first, *list = NULL;
 	char *p;
 	int i, err, offset;
 
-	sg_init_table(sg, MAX_SKB_FRAGS + 2);
-	/* page in sg[MAX_SKB_FRAGS + 1] is list tail */
+	/* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
 	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
 		first = get_a_page(vi, gfp);
 		if (!first) {
@@ -361,7 +360,7 @@ static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
 				give_pages(vi, list);
 			return -ENOMEM;
 		}
-		sg_set_buf(&sg[i], page_address(first), PAGE_SIZE);
+		sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
 		first->private = (unsigned long)list;
@@ -375,17 +374,17 @@ static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
 	}
 	p = page_address(first);
 
-	/* sg[0], sg[1] share the same page */
-	/* a separated sg[0] for  virtio_net_hdr only during to QEMU bug*/
-	sg_set_buf(&sg[0], p, sizeof(struct virtio_net_hdr));
+	/* vi->rx_sg[0], vi->rx_sg[1] share the same page */
+	/* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
+	sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
 
-	/* sg[1] for data packet, from offset */
+	/* vi->rx_sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
-	sg_set_buf(&sg[1], p + offset, PAGE_SIZE - offset);
+	sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2,
+	err = vi->rvq->vq_ops->add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
 				       first);
 	if (err < 0)
 		give_pages(vi, first);
@@ -396,16 +395,15 @@ static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
 static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
 {
 	struct page *page;
-	struct scatterlist sg;
 	int err;
 
 	page = get_a_page(vi, gfp);
 	if (!page)
 		return -ENOMEM;
 
-	sg_init_one(&sg, page_address(page), PAGE_SIZE);
+	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
 
-	err = vi->rvq->vq_ops->add_buf(vi->rvq, &sg, 0, 1, page);
+	err = vi->rvq->vq_ops->add_buf(vi->rvq, vi->rx_sg, 0, 1, page);
 	if (err < 0)
 		give_pages(vi, page);
 
@@ -514,12 +512,9 @@ static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
 
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 {
-	struct scatterlist sg[2+MAX_SKB_FRAGS];
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 
-	sg_init_table(sg, 2+MAX_SKB_FRAGS);
-
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -553,12 +548,13 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 
 	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
-	return vi->svq->vq_ops->add_buf(vi->svq, sg, hdr->num_sg, 0, skb);
+	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
+	return vi->svq->vq_ops->add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+					0, skb);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -941,6 +937,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	vdev->priv = vi;
 	vi->pages = NULL;
 	INIT_DELAYED_WORK(&vi->refill, refill_work);
+	sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
+	sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
-- 
1.7.0.2.280.gc6f05

^ permalink raw reply related

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teräs @ 2010-04-04 12:09 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev
In-Reply-To: <20100404113142.GA11124@gondor.apana.org.au>

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
>> Fine, move key into flow_cache_entry but the rest should still
>> work, no?
>
> OK this doesn't work either as we still have NULL objects for
> now.  But I still think even if the ops pointer is the only
> member in flow_cache_object, it looks better than returning the
> nested ops pointer directly from flow_cache_lookup.

Yes, it'll look better. I'll wrap the pointer in a struct.

Ok, so far it's:
 - constify ops
 - indentation fixes for flow.c struct's with pointer members
 - wrap ops* in a struct* to avoid ops**

Will fix and resend refreshed patches tomorrow.

Thanks.

^ permalink raw reply

* RE: [PATCH] bnx2x: use the dma state API instead of the pci equivalents
From: FUJITA Tomonori @ 2010-04-04 11:51 UTC (permalink / raw)
  To: vladz; +Cc: fujita.tomonori, davem, netdev, eilong
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDC525ADDD@SJEXCHCCR02.corp.ad.broadcom.com>

On Sun, 4 Apr 2010 03:24:46 -0700
"Vladislav Zolotarov" <vladz@broadcom.com> wrote:

> Ok. Got it now. Thanks, Fujita. I think we should patch the bnx2x to
> use the generic model (not just the mapping macros).

I've attached the patch.

There is one functional change: pci_alloc_consistent ->
dma_alloc_coherent

pci_alloc_consistent is a wrapper function of dma_alloc_coherent with
GFP_ATOMIC flag (see include/asm-generic/pci-dma-compat.h).

pci_alloc_consistent uses the GFP_ATOMIC flag for compatibility with
some broken drivers that call the function in interrupt context. But
GFP_ATOMIC should be avoided if possible. It looks like bnx2x doesn't call
pci_alloc_consistent in interrupt context, so I replaced the calls with
dma_alloc_coherent with GFP_KERNEL.

Please check if that change works for bnx2x.

> One last question: since which kernel version the generic DMA layer
> may be used instead of PCI DMA layer?

After 2.6.34-rc2.

Well, on the majority of architectures, you have been able to use the
generic DMA API instead of the PCI DMA API. The PCI DMA API is just a
wrapper around the generic DMA API. But on some architectures, the two
APIs worked slightly differently. Since 2.6.34-rc2, the two APIs work in
exactly the same way on all architectures.


=
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Subject: [PATCH] bnx2x: use the DMA API instead of the pci equivalents

The DMA API is preferred.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
---
 drivers/net/bnx2x.h      |    4 +-
 drivers/net/bnx2x_main.c |  110 +++++++++++++++++++++++----------------------
 2 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
index 3c48a7a..ae9c89e 100644
--- a/drivers/net/bnx2x.h
+++ b/drivers/net/bnx2x.h
@@ -163,7 +163,7 @@ do {								 \
 
 struct sw_rx_bd {
 	struct sk_buff	*skb;
-	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
 struct sw_tx_bd {
@@ -176,7 +176,7 @@ struct sw_tx_bd {
 
 struct sw_rx_page {
 	struct page	*page;
-	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
 union db_prod {
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index fa9275c..63a17d6 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -842,7 +842,7 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	/* unmap first bd */
 	DP(BNX2X_MSG_OFF, "free bd_idx %d\n", bd_idx);
 	tx_start_bd = &fp->tx_desc_ring[bd_idx].start_bd;
-	pci_unmap_single(bp->pdev, BD_UNMAP_ADDR(tx_start_bd),
+	dma_unmap_single(&bp->pdev->dev, BD_UNMAP_ADDR(tx_start_bd),
 			 BD_UNMAP_LEN(tx_start_bd), PCI_DMA_TODEVICE);
 
 	nbd = le16_to_cpu(tx_start_bd->nbd) - 1;
@@ -872,8 +872,8 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 
 		DP(BNX2X_MSG_OFF, "free frag bd_idx %d\n", bd_idx);
 		tx_data_bd = &fp->tx_desc_ring[bd_idx].reg_bd;
-		pci_unmap_page(bp->pdev, BD_UNMAP_ADDR(tx_data_bd),
-			       BD_UNMAP_LEN(tx_data_bd), PCI_DMA_TODEVICE);
+		dma_unmap_page(&bp->pdev->dev, BD_UNMAP_ADDR(tx_data_bd),
+			       BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE);
 		if (--nbd)
 			bd_idx = TX_BD(NEXT_TX_IDX(bd_idx));
 	}
@@ -1086,7 +1086,7 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	pci_unmap_page(bp->pdev, pci_unmap_addr(sw_buf, mapping),
+	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
 		       SGE_PAGE_SIZE*PAGES_PER_SGE, PCI_DMA_FROMDEVICE);
 	__free_pages(page, PAGES_PER_SGE_SHIFT);
 
@@ -1115,15 +1115,15 @@ static inline int bnx2x_alloc_rx_sge(struct bnx2x *bp,
 	if (unlikely(page == NULL))
 		return -ENOMEM;
 
-	mapping = pci_map_page(bp->pdev, page, 0, SGE_PAGE_SIZE*PAGES_PER_SGE,
-			       PCI_DMA_FROMDEVICE);
+	mapping = dma_map_page(&bp->pdev->dev, page, 0,
+			       SGE_PAGE_SIZE*PAGES_PER_SGE, DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		__free_pages(page, PAGES_PER_SGE_SHIFT);
 		return -ENOMEM;
 	}
 
 	sw_buf->page = page;
-	pci_unmap_addr_set(sw_buf, mapping, mapping);
+	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -1143,15 +1143,15 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
 	if (unlikely(skb == NULL))
 		return -ENOMEM;
 
-	mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_size,
-				 PCI_DMA_FROMDEVICE);
+	mapping = dma_map_single(&bp->pdev->dev, skb->data, bp->rx_buf_size,
+				 DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
 
 	rx_buf->skb = skb;
-	pci_unmap_addr_set(rx_buf, mapping, mapping);
+	dma_unmap_addr_set(rx_buf, mapping, mapping);
 
 	rx_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 	rx_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -1173,13 +1173,13 @@ static void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
 	struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
 	struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
 
-	pci_dma_sync_single_for_device(bp->pdev,
-				       pci_unmap_addr(cons_rx_buf, mapping),
-				       RX_COPY_THRESH, PCI_DMA_FROMDEVICE);
+	dma_sync_single_for_device(&bp->pdev->dev,
+				   dma_unmap_addr(cons_rx_buf, mapping),
+				   RX_COPY_THRESH, DMA_FROM_DEVICE);
 
 	prod_rx_buf->skb = cons_rx_buf->skb;
-	pci_unmap_addr_set(prod_rx_buf, mapping,
-			   pci_unmap_addr(cons_rx_buf, mapping));
+	dma_unmap_addr_set(prod_rx_buf, mapping,
+			   dma_unmap_addr(cons_rx_buf, mapping));
 	*prod_bd = *cons_bd;
 }
 
@@ -1283,9 +1283,9 @@ static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
 
 	/* move empty skb from pool to prod and map it */
 	prod_rx_buf->skb = fp->tpa_pool[queue].skb;
-	mapping = pci_map_single(bp->pdev, fp->tpa_pool[queue].skb->data,
-				 bp->rx_buf_size, PCI_DMA_FROMDEVICE);
-	pci_unmap_addr_set(prod_rx_buf, mapping, mapping);
+	mapping = dma_map_single(&bp->pdev->dev, fp->tpa_pool[queue].skb->data,
+				 bp->rx_buf_size, DMA_FROM_DEVICE);
+	dma_unmap_addr_set(prod_rx_buf, mapping, mapping);
 
 	/* move partial skb from cons to pool (don't unmap yet) */
 	fp->tpa_pool[queue] = *cons_rx_buf;
@@ -1361,8 +1361,9 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 		}
 
 		/* Unmap the page as we r going to pass it to the stack */
-		pci_unmap_page(bp->pdev, pci_unmap_addr(&old_rx_pg, mapping),
-			      SGE_PAGE_SIZE*PAGES_PER_SGE, PCI_DMA_FROMDEVICE);
+		dma_unmap_page(&bp->pdev->dev,
+			       dma_unmap_addr(&old_rx_pg, mapping),
+			       SGE_PAGE_SIZE*PAGES_PER_SGE, DMA_FROM_DEVICE);
 
 		/* Add one frag and update the appropriate fields in the skb */
 		skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
@@ -1389,8 +1390,8 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	/* Unmap skb in the pool anyway, as we are going to change
 	   pool entry status to BNX2X_TPA_STOP even if new skb allocation
 	   fails. */
-	pci_unmap_single(bp->pdev, pci_unmap_addr(rx_buf, mapping),
-			 bp->rx_buf_size, PCI_DMA_FROMDEVICE);
+	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
+			 bp->rx_buf_size, DMA_FROM_DEVICE);
 
 	if (likely(new_skb)) {
 		/* fix ip xsum and give it to the stack */
@@ -1620,10 +1621,10 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 				}
 			}
 
-			pci_dma_sync_single_for_device(bp->pdev,
-					pci_unmap_addr(rx_buf, mapping),
-						       pad + RX_COPY_THRESH,
-						       PCI_DMA_FROMDEVICE);
+			dma_sync_single_for_device(&bp->pdev->dev,
+					dma_unmap_addr(rx_buf, mapping),
+						   pad + RX_COPY_THRESH,
+						   DMA_FROM_DEVICE);
 			prefetch(skb);
 			prefetch(((char *)(skb)) + 128);
 
@@ -1665,10 +1666,10 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 
 			} else
 			if (likely(bnx2x_alloc_rx_skb(bp, fp, bd_prod) == 0)) {
-				pci_unmap_single(bp->pdev,
-					pci_unmap_addr(rx_buf, mapping),
+				dma_unmap_single(&bp->pdev->dev,
+					dma_unmap_addr(rx_buf, mapping),
 						 bp->rx_buf_size,
-						 PCI_DMA_FROMDEVICE);
+						 DMA_FROM_DEVICE);
 				skb_reserve(skb, pad);
 				skb_put(skb, len);
 
@@ -4940,9 +4941,9 @@ static inline void bnx2x_free_tpa_pool(struct bnx2x *bp,
 		}
 
 		if (fp->tpa_state[i] == BNX2X_TPA_START)
-			pci_unmap_single(bp->pdev,
-					 pci_unmap_addr(rx_buf, mapping),
-					 bp->rx_buf_size, PCI_DMA_FROMDEVICE);
+			dma_unmap_single(&bp->pdev->dev,
+					 dma_unmap_addr(rx_buf, mapping),
+					 bp->rx_buf_size, DMA_FROM_DEVICE);
 
 		dev_kfree_skb(skb);
 		rx_buf->skb = NULL;
@@ -4978,7 +4979,7 @@ static void bnx2x_init_rx_rings(struct bnx2x *bp)
 					fp->disable_tpa = 1;
 					break;
 				}
-				pci_unmap_addr_set((struct sw_rx_bd *)
+				dma_unmap_addr_set((struct sw_rx_bd *)
 							&bp->fp->tpa_pool[i],
 						   mapping, 0);
 				fp->tpa_state[i] = BNX2X_TPA_STOP;
@@ -5658,8 +5659,8 @@ static void bnx2x_nic_init(struct bnx2x *bp, u32 load_code)
 
 static int bnx2x_gunzip_init(struct bnx2x *bp)
 {
-	bp->gunzip_buf = pci_alloc_consistent(bp->pdev, FW_BUF_SIZE,
-					      &bp->gunzip_mapping);
+	bp->gunzip_buf = dma_alloc_coherent(&bp->pdev->dev, FW_BUF_SIZE,
+					    &bp->gunzip_mapping, GFP_KERNEL);
 	if (bp->gunzip_buf  == NULL)
 		goto gunzip_nomem1;
 
@@ -5679,8 +5680,8 @@ gunzip_nomem3:
 	bp->strm = NULL;
 
 gunzip_nomem2:
-	pci_free_consistent(bp->pdev, FW_BUF_SIZE, bp->gunzip_buf,
-			    bp->gunzip_mapping);
+	dma_free_coherent(&bp->pdev->dev, FW_BUF_SIZE, bp->gunzip_buf,
+			  bp->gunzip_mapping);
 	bp->gunzip_buf = NULL;
 
 gunzip_nomem1:
@@ -5696,8 +5697,8 @@ static void bnx2x_gunzip_end(struct bnx2x *bp)
 	bp->strm = NULL;
 
 	if (bp->gunzip_buf) {
-		pci_free_consistent(bp->pdev, FW_BUF_SIZE, bp->gunzip_buf,
-				    bp->gunzip_mapping);
+		dma_free_coherent(&bp->pdev->dev, FW_BUF_SIZE, bp->gunzip_buf,
+				  bp->gunzip_mapping);
 		bp->gunzip_buf = NULL;
 	}
 }
@@ -6692,7 +6693,7 @@ static void bnx2x_free_mem(struct bnx2x *bp)
 #define BNX2X_PCI_FREE(x, y, size) \
 	do { \
 		if (x) { \
-			pci_free_consistent(bp->pdev, size, x, y); \
+			dma_free_coherent(&bp->pdev->dev, size, x, y); \
 			x = NULL; \
 			y = 0; \
 		} \
@@ -6773,7 +6774,7 @@ static int bnx2x_alloc_mem(struct bnx2x *bp)
 
 #define BNX2X_PCI_ALLOC(x, y, size) \
 	do { \
-		x = pci_alloc_consistent(bp->pdev, size, y); \
+		x = dma_alloc_coherent(&bp->pdev->dev, size, y, GFP_KERNEL); \
 		if (x == NULL) \
 			goto alloc_mem_err; \
 		memset(x, 0, size); \
@@ -6906,9 +6907,9 @@ static void bnx2x_free_rx_skbs(struct bnx2x *bp)
 			if (skb == NULL)
 				continue;
 
-			pci_unmap_single(bp->pdev,
-					 pci_unmap_addr(rx_buf, mapping),
-					 bp->rx_buf_size, PCI_DMA_FROMDEVICE);
+			dma_unmap_single(&bp->pdev->dev,
+					 dma_unmap_addr(rx_buf, mapping),
+					 bp->rx_buf_size, DMA_FROM_DEVICE);
 
 			rx_buf->skb = NULL;
 			dev_kfree_skb(skb);
@@ -10269,8 +10270,8 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode, u8 link_up)
 
 	bd_prod = TX_BD(fp_tx->tx_bd_prod);
 	tx_start_bd = &fp_tx->tx_desc_ring[bd_prod].start_bd;
-	mapping = pci_map_single(bp->pdev, skb->data,
-				 skb_headlen(skb), PCI_DMA_TODEVICE);
+	mapping = dma_map_single(&bp->pdev->dev, skb->data,
+				 skb_headlen(skb), DMA_TO_DEVICE);
 	tx_start_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 	tx_start_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
 	tx_start_bd->nbd = cpu_to_le16(2); /* start + pbd */
@@ -11316,8 +11317,8 @@ static netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 
-	mapping = pci_map_single(bp->pdev, skb->data,
-				 skb_headlen(skb), PCI_DMA_TODEVICE);
+	mapping = dma_map_single(&bp->pdev->dev, skb->data,
+				 skb_headlen(skb), DMA_TO_DEVICE);
 
 	tx_start_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 	tx_start_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -11374,8 +11375,9 @@ static netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (total_pkt_bd == NULL)
 			total_pkt_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
 
-		mapping = pci_map_page(bp->pdev, frag->page, frag->page_offset,
-				       frag->size, PCI_DMA_TODEVICE);
+		mapping = dma_map_page(&bp->pdev->dev, frag->page,
+				       frag->page_offset,
+				       frag->size, DMA_TO_DEVICE);
 
 		tx_data_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 		tx_data_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -11832,15 +11834,15 @@ static int __devinit bnx2x_init_dev(struct pci_dev *pdev,
 		goto err_out_release;
 	}
 
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+	if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) == 0) {
 		bp->flags |= USING_DAC_FLAG;
-		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
-			pr_err("pci_set_consistent_dma_mask failed, aborting\n");
+		if (dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)) != 0) {
+			pr_err("dma_set_coherent_mask failed, aborting\n");
 			rc = -EIO;
 			goto err_out_release;
 		}
 
-	} else if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+	} else if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)) != 0) {
 		pr_err("System does not support DMA, aborting\n");
 		rc = -EIO;
 		goto err_out_release;
-- 
1.7.0


^ permalink raw reply related

* Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
From: Michael S. Tsirkin @ 2010-04-04 11:40 UTC (permalink / raw)
  To: Xin, Xiaohui
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu, jdike@addtoit.com
In-Reply-To: <97F6D3BD476C464182C1B7BABF0B0AF5C17B55B5@shzsmsx502.ccr.corp.intel.com>

On Fri, Apr 02, 2010 at 10:16:16AM +0800, Xin, Xiaohui wrote:
> 
> >> For the write logging, do you have a function in hand that we can
> >> recompute the log? If that, I think I can use it to recompute the
> >>log info when the logging is suddenly enabled.
> >> For the outstanding requests, do you mean all the user buffers have
> >>submitted before the logging ioctl changed? That may be a lot, and
> >> some of them are still in NIC ring descriptors. Waiting them to be
> >>finished may be need some time. I think when logging ioctl changed,
> >> then the logging is changed just after that is also reasonable.
> 
> >The key point is that after logging ioctl returns, any
> >subsequent change to memory must be logged. It does not
> >matter when was the request submitted, otherwise we will
> >get memory corruption on migration.
> 
> The change to memory happens when vhost_add_used_and_signal(), right?
> So after ioctl returns, just recompute the log info to the events in the async queue,
> is ok. Since the ioctl and write log operations are all protected by vq->mutex.
> 
> Thanks
> Xiaohui

Yes, I think this will work.

> > Thanks
> > Xiaohui
> > 
> >  drivers/vhost/net.c   |  189 +++++++++++++++++++++++++++++++++++++++++++++++--
> >  drivers/vhost/vhost.h |   10 +++
> >  2 files changed, 192 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 22d5fef..2aafd90 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -17,11 +17,13 @@
> >  #include <linux/workqueue.h>
> >  #include <linux/rcupdate.h>
> >  #include <linux/file.h>
> > +#include <linux/aio.h>
> >  
> >  #include <linux/net.h>
> >  #include <linux/if_packet.h>
> >  #include <linux/if_arp.h>
> >  #include <linux/if_tun.h>
> > +#include <linux/mpassthru.h>
> >  
> >  #include <net/sock.h>
> >  
> > @@ -47,6 +49,7 @@ struct vhost_net {
> >  	struct vhost_dev dev;
> >  	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> >  	struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > +	struct kmem_cache       *cache;
> >  	/* Tells us whether we are polling a socket for TX.
> >  	 * We only do this when socket buffer fills up.
> >  	 * Protected by tx vq lock. */
> > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> >  	net->tx_poll_state = VHOST_NET_POLL_STARTED;
> >  }
> >  
> > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > +{
> > +	struct kiocb *iocb = NULL;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&vq->notify_lock, flags);
> > +	if (!list_empty(&vq->notifier)) {
> > +		iocb = list_first_entry(&vq->notifier,
> > +				struct kiocb, ki_list);
> > +		list_del(&iocb->ki_list);
> > +	}
> > +	spin_unlock_irqrestore(&vq->notify_lock, flags);
> > +	return iocb;
> > +}
> > +
> > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > +					struct vhost_virtqueue *vq)
> > +{
> > +	struct kiocb *iocb = NULL;
> > +	struct vhost_log *vq_log = NULL;
> > +	int rx_total_len = 0;
> > +	int log, size;
> > +
> > +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +		return;
> > +
> > +	if (vq->receiver)
> > +		vq->receiver(vq);
> > +
> > +	vq_log = unlikely(vhost_has_feature(
> > +				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > +	while ((iocb = notify_dequeue(vq)) != NULL) {
> > +		vhost_add_used_and_signal(&net->dev, vq,
> > +				iocb->ki_pos, iocb->ki_nbytes);
> > +		log = (int)iocb->ki_user_data;
> > +		size = iocb->ki_nbytes;
> > +		rx_total_len += iocb->ki_nbytes;
> > +
> > +		if (iocb->ki_dtor)
> > +			iocb->ki_dtor(iocb);
> > +		kmem_cache_free(net->cache, iocb);
> > +
> > +		if (unlikely(vq_log))
> > +			vhost_log_write(vq, vq_log, log, size);
> > +		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > +			vhost_poll_queue(&vq->poll);
> > +			break;
> > +		}
> > +	}
> > +}
> > +
> > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > +					struct vhost_virtqueue *vq)
> > +{
> > +	struct kiocb *iocb = NULL;
> > +	int tx_total_len = 0;
> > +
> > +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +		return;
> > +
> > +	while ((iocb = notify_dequeue(vq)) != NULL) {
> > +		vhost_add_used_and_signal(&net->dev, vq,
> > +				iocb->ki_pos, 0);
> > +		tx_total_len += iocb->ki_nbytes;
> > +
> > +		if (iocb->ki_dtor)
> > +			iocb->ki_dtor(iocb);
> > +
> > +		kmem_cache_free(net->cache, iocb);
> > +		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > +			vhost_poll_queue(&vq->poll);
> > +			break;
> > +		}
> > +	}
> > +}
> > +
> >  /* Expects to be always run from workqueue - which acts as
> >   * read-size critical section for our kind of RCU. */
> >  static void handle_tx(struct vhost_net *net)
> >  {
> >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> > +	struct kiocb *iocb = NULL;
> >  	unsigned head, out, in, s;
> >  	struct msghdr msg = {
> >  		.msg_name = NULL,
> > @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
> >  		tx_poll_stop(net);
> >  	hdr_size = vq->hdr_size;
> >  
> > +	handle_async_tx_events_notify(net, vq);
> > +
> >  	for (;;) {
> >  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> >  					 ARRAY_SIZE(vq->iov),
> > @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
> >  		/* Skip header. TODO: support TSO. */
> >  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> >  		msg.msg_iovlen = out;
> > +
> > +		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > +			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > +			if (!iocb)
> > +				break;
> > +			iocb->ki_pos = head;
> > +			iocb->private = (void *)vq;
> > +		}
> > +
> >  		len = iov_length(vq->iov, out);
> >  		/* Sanity check */
> >  		if (!len) {
> > @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
> >  			break;
> >  		}
> >  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> > -		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > +		err = sock->ops->sendmsg(iocb, sock, &msg, len);
> >  		if (unlikely(err < 0)) {
> >  			vhost_discard_vq_desc(vq);
> >  			tx_poll_start(net, sock);
> >  			break;
> >  		}
> > +
> > +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > +			continue;
> > +
> >  		if (err != len)
> >  			pr_err("Truncated TX packet: "
> >  			       " len %d != %zd\n", err, len);
> > @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
> >  		}
> >  	}
> >  
> > +	handle_async_tx_events_notify(net, vq);
> > +
> >  	mutex_unlock(&vq->mutex);
> >  	unuse_mm(net->dev.mm);
> >  }
> > @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
> >  static void handle_rx(struct vhost_net *net)
> >  {
> >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > +	struct kiocb *iocb = NULL;
> >  	unsigned head, out, in, log, s;
> >  	struct vhost_log *vq_log;
> >  	struct msghdr msg = {
> > @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
> >  	int err;
> >  	size_t hdr_size;
> >  	struct socket *sock = rcu_dereference(vq->private_data);
> > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > +	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> > +			vq->link_state == VHOST_VQ_LINK_SYNC))
> >  		return;
> >  
> >  	use_mm(net->dev.mm);
> > @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
> >  	vhost_disable_notify(vq);
> >  	hdr_size = vq->hdr_size;
> >  
> > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > +	/* In async cases, for write logging, the simple way is to get
> > +	 * the log info always, and really logging is decided later.
> > +	 * Thus, when logging enabled, we can get log, and when logging
> > +	 * disabled, we can get log disabled accordingly.
> > +	 */
> > +
> > +	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> > +		(vq->link_state == VHOST_VQ_LINK_ASYNC) ?
> >  		vq->log : NULL;
> >  
> > +	handle_async_rx_events_notify(net, vq);
> > +
> >  	for (;;) {
> >  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> >  					 ARRAY_SIZE(vq->iov),
> > @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
> >  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> >  		msg.msg_iovlen = in;
> >  		len = iov_length(vq->iov, in);
> > +		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > +			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > +			if (!iocb)
> > +				break;
> > +			iocb->private = vq;
> > +			iocb->ki_pos = head;
> > +			iocb->ki_user_data = log;
> > +		}
> >  		/* Sanity check */
> >  		if (!len) {
> >  			vq_err(vq, "Unexpected header len for RX: "
> > @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
> >  			       iov_length(vq->hdr, s), hdr_size);
> >  			break;
> >  		}
> > -		err = sock->ops->recvmsg(NULL, sock, &msg,
> > +
> > +		err = sock->ops->recvmsg(iocb, sock, &msg,
> >  					 len, MSG_DONTWAIT | MSG_TRUNC);
> >  		/* TODO: Check specific error and bomb out unless EAGAIN? */
> >  		if (err < 0) {
> >  			vhost_discard_vq_desc(vq);
> >  			break;
> >  		}
> > +
> > +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > +			continue;
> > +
> >  		/* TODO: Should check and handle checksum. */
> >  		if (err > len) {
> >  			pr_err("Discarded truncated rx packet: "
> > @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
> >  		}
> >  	}
> >  
> > +	handle_async_rx_events_notify(net, vq);
> > +
> >  	mutex_unlock(&vq->mutex);
> >  	unuse_mm(net->dev.mm);
> >  }
> >  
> > +
> >  static void handle_tx_kick(struct work_struct *work)
> >  {
> >  	struct vhost_virtqueue *vq;
> > @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> >  	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> >  	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> >  	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> > +	n->cache = NULL;
> >  	return 0;
> >  }
> >  
> > @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
> >  	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> >  }
> >  
> > +static void vhost_notifier_cleanup(struct vhost_net *n)
> > +{
> > +	struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> > +	struct kiocb *iocb = NULL;
> > +	if (n->cache) {
> > +		while ((iocb = notify_dequeue(vq)) != NULL)
> > +			kmem_cache_free(n->cache, iocb);
> > +		kmem_cache_destroy(n->cache);
> > +	}
> > +}
> > +
> >  static int vhost_net_release(struct inode *inode, struct file *f)
> >  {
> >  	struct vhost_net *n = f->private_data;
> > @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> >  	/* We do an extra flush before freeing memory,
> >  	 * since jobs can re-queue themselves. */
> >  	vhost_net_flush(n);
> > +	vhost_notifier_cleanup(n);
> >  	kfree(n);
> >  	return 0;
> >  }
> > @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
> >  	return sock;
> >  }
> >  
> > -static struct socket *get_socket(int fd)
> > +static struct socket *get_mp_socket(int fd)
> > +{
> > +	struct file *file = fget(fd);
> > +	struct socket *sock;
> > +	if (!file)
> > +		return ERR_PTR(-EBADF);
> > +	sock = mp_get_socket(file);
> > +	if (IS_ERR(sock))
> > +		fput(file);
> > +	return sock;
> > +}
> > +
> > +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
> >  {
> >  	struct socket *sock;
> >  	if (fd == -1)
> > @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
> >  	sock = get_tun_socket(fd);
> >  	if (!IS_ERR(sock))
> >  		return sock;
> > +	sock = get_mp_socket(fd);
> > +	if (!IS_ERR(sock)) {
> > +		vq->link_state = VHOST_VQ_LINK_ASYNC;
> > +		return sock;
> > +	}
> >  	return ERR_PTR(-ENOTSOCK);
> >  }
> >  
> > +static void vhost_init_link_state(struct vhost_net *n, int index)
> > +{
> > +	struct vhost_virtqueue *vq = n->vqs + index;
> > +
> > +	WARN_ON(!mutex_is_locked(&vq->mutex));
> > +	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > +		vq->receiver = NULL;
> > +		INIT_LIST_HEAD(&vq->notifier);
> > +		spin_lock_init(&vq->notify_lock);
> > +		if (!n->cache) {
> > +			n->cache = kmem_cache_create("vhost_kiocb",
> > +					sizeof(struct kiocb), 0,
> > +					SLAB_HWCACHE_ALIGN, NULL);
> > +		}
> > +	}
> > +}
> > +
> >  static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> >  {
> >  	struct socket *sock, *oldsock;
> > @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> >  	}
> >  	vq = n->vqs + index;
> >  	mutex_lock(&vq->mutex);
> > -	sock = get_socket(fd);
> > +	vq->link_state = VHOST_VQ_LINK_SYNC;
> > +	sock = get_socket(vq, fd);
> >  	if (IS_ERR(sock)) {
> >  		r = PTR_ERR(sock);
> >  		goto err;
> >  	}
> >  
> > +	vhost_init_link_state(n, index);
> > +
> >  	/* start polling new socket */
> >  	oldsock = vq->private_data;
> >  	if (sock == oldsock)
> > @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> >  	vhost_net_disable_vq(n, vq);
> >  	rcu_assign_pointer(vq->private_data, sock);
> >  	vhost_net_enable_vq(n, vq);
> > -	mutex_unlock(&vq->mutex);
> >  done:
> > +	mutex_unlock(&vq->mutex);
> >  	mutex_unlock(&n->dev.mutex);
> >  	if (oldsock) {
> >  		vhost_net_flush_vq(n, index);
> > @@ -516,6 +690,7 @@ done:
> >  	}
> >  	return r;
> >  err:
> > +	mutex_unlock(&vq->mutex);
> >  	mutex_unlock(&n->dev.mutex);
> >  	return r;
> >  }
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index d1f0453..cffe39a 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -43,6 +43,11 @@ struct vhost_log {
> >  	u64 len;
> >  };
> >  
> > +enum vhost_vq_link_state {
> > +	VHOST_VQ_LINK_SYNC = 	0,
> > +	VHOST_VQ_LINK_ASYNC = 	1,
> > +};
> > +
> >  /* The virtqueue structure describes a queue attached to a device. */
> >  struct vhost_virtqueue {
> >  	struct vhost_dev *dev;
> > @@ -96,6 +101,11 @@ struct vhost_virtqueue {
> >  	/* Log write descriptors */
> >  	void __user *log_base;
> >  	struct vhost_log log[VHOST_NET_MAX_SG];
> > +	/*Differiate async socket for 0-copy from normal*/
> > +	enum vhost_vq_link_state link_state;
> > +	struct list_head notifier;
> > +	spinlock_t notify_lock;
> > +	void (*receiver)(struct vhost_virtqueue *);
> >  };
> >  
> >  struct vhost_dev {
> > -- 
> > 1.5.4.4

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Herbert Xu @ 2010-04-04 11:31 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev
In-Reply-To: <20100404112636.GA11061@gondor.apana.org.au>

On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
> 
> Fine, move key into flow_cache_entry but the rest should still
> work, no?
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flow_cache_ops  *ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flowi key;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };

OK this doesn't work either as we still have NULL objects for
now.  But I still think even if the ops pointer is the only
member in flow_cache_object, it looks better than returning the
nested ops pointer directly from flow_cache_lookup.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Herbert Xu @ 2010-04-04 11:26 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev
In-Reply-To: <4BB872CF.2030202@iki.fi>

On Sun, Apr 04, 2010 at 02:06:55PM +0300, Timo Teräs wrote:
>
> It would still not work for policies. For every policy X we
> can get N+1 different matches with separate struct flowi contents.
> It's not possible to put single struct flowi or any other of
> the flow details in to xfrm_policy. It's a N-to-1 mapping. Not
> a 1-to-1 mapping.

Fine, move key into flow_cache_entry but the rest should still
work, no?

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flow_cache_ops  *ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flowi key;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
From: Michael S. Tsirkin @ 2010-04-04 11:14 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Tom Lendacky, netdev, kvm@vger.kernel.org
In-Reply-To: <1270229480.13897.8.camel@w-sridhar.beaverton.ibm.com>

On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote:
> Make vhost scalable by creating a separate vhost thread per vhost
> device. This provides better scaling across multiple guests and with
> multiple interfaces in a guest.

Thanks for looking into this. An alternative approach is
to simply replace create_singlethread_workqueue with
create_workqueue which would get us a thread per host CPU.

It seems that in theory this should be the optimal approach
wrt CPU locality; however, in practice a single thread
seems to get better numbers. I have a TODO to investigate this.
Could you try looking into this?

> 
> I am seeing better aggregated throughput/latency when running netperf
> across multiple guests or multiple interfaces in a guest in parallel
> with this patch.

Any numbers? What happens to CPU utilization?

> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index a6a88df..29aa80f 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -339,8 +339,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  		return r;
>  	}
>  
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT,
> +			&n->dev);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN,
> +			&n->dev);
>  	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
>  
>  	f->private_data = n;
> @@ -643,25 +645,14 @@ static struct miscdevice vhost_net_misc = {
>  
>  int vhost_net_init(void)
>  {
> -	int r = vhost_init();
> -	if (r)
> -		goto err_init;
> -	r = misc_register(&vhost_net_misc);
> -	if (r)
> -		goto err_reg;
> -	return 0;
> -err_reg:
> -	vhost_cleanup();
> -err_init:
> -	return r;
> -
> +	return misc_register(&vhost_net_misc);
>  }
> +
>  module_init(vhost_net_init);
>  
>  void vhost_net_exit(void)
>  {
>  	misc_deregister(&vhost_net_misc);
> -	vhost_cleanup();
>  }
>  module_exit(vhost_net_exit);
>  
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 7bd7a1e..243f4d3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -36,8 +36,6 @@ enum {
>  	VHOST_MEMORY_F_LOG = 0x1,
>  };
>  
> -static struct workqueue_struct *vhost_workqueue;
> -
>  static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
>  			    poll_table *pt)
>  {
> @@ -56,18 +54,19 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
>  	if (!((unsigned long)key & poll->mask))
>  		return 0;
>  
> -	queue_work(vhost_workqueue, &poll->work);
> +	queue_work(poll->dev->wq, &poll->work);
>  	return 0;
>  }
>  
>  /* Init poll structure */
>  void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask)
> +		     unsigned long mask, struct vhost_dev *dev)
>  {
>  	INIT_WORK(&poll->work, func);
>  	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
>  	init_poll_funcptr(&poll->table, vhost_poll_func);
>  	poll->mask = mask;
> +	poll->dev = dev;
>  }
>  
>  /* Start polling a file. We add ourselves to file's wait queue. The caller must
> @@ -96,7 +95,7 @@ void vhost_poll_flush(struct vhost_poll *poll)
>  
>  void vhost_poll_queue(struct vhost_poll *poll)
>  {
> -	queue_work(vhost_workqueue, &poll->work);
> +	queue_work(poll->dev->wq, &poll->work);
>  }
>  
>  static void vhost_vq_reset(struct vhost_dev *dev,
> @@ -128,6 +127,11 @@ long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
>  	int i;
> +
> +	dev->wq = create_singlethread_workqueue("vhost");
> +	if (!dev->wq)
> +		return -ENOMEM;
> +
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
>  	mutex_init(&dev->mutex);
> @@ -143,7 +147,7 @@ long vhost_dev_init(struct vhost_dev *dev,
>  		if (dev->vqs[i].handle_kick)
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick,
> -					POLLIN);
> +					POLLIN, dev);
>  	}
>  	return 0;
>  }
> @@ -216,6 +220,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  	if (dev->mm)
>  		mmput(dev->mm);
>  	dev->mm = NULL;
> +
> +	destroy_workqueue(dev->wq);
>  }
>  
>  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> @@ -1095,16 +1101,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> -
> -int vhost_init(void)
> -{
> -	vhost_workqueue = create_singlethread_workqueue("vhost");
> -	if (!vhost_workqueue)
> -		return -ENOMEM;
> -	return 0;
> -}
> -
> -void vhost_cleanup(void)
> -{
> -	destroy_workqueue(vhost_workqueue);
> -}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 44591ba..60fefd0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -29,10 +29,11 @@ struct vhost_poll {
>  	/* struct which will handle all actual work. */
>  	struct work_struct        work;
>  	unsigned long		  mask;
> +	struct vhost_dev	 *dev;
>  };
>  
>  void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask);
> +		     unsigned long mask, struct vhost_dev *dev);
>  void vhost_poll_start(struct vhost_poll *poll, struct file *file);
>  void vhost_poll_stop(struct vhost_poll *poll);
>  void vhost_poll_flush(struct vhost_poll *poll);
> @@ -110,6 +111,7 @@ struct vhost_dev {
>  	int nvqs;
>  	struct file *log_file;
>  	struct eventfd_ctx *log_ctx;
> +	struct workqueue_struct *wq;
>  };
>  
>  long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> @@ -136,9 +138,6 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
>  
> -int vhost_init(void);
> -void vhost_cleanup(void);
> -
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>  		if ((vq)->error_ctx)                               \
> 
> 
> 

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teräs @ 2010-04-04 11:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev
In-Reply-To: <20100404110014.GA10864@gondor.apana.org.au>

Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>> Because flow_cache_entry is per-cpu, and multiple entries (due to
>> different flows matching same policies, or same flow having multiple
>> per-cpu entries) can point to same policy. If we cached "dummy" objects
>> for even policies, then this would be better approach.
> 
> Oh yes of course.
> 
> But what we could do is embed most of flow_cache_entry into
> xfrm_policy (and xdst in your latter patches) along with the
> ops pointer.
> 
> Like this:
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flowi		key;
> 	struct flow_cache_ops **ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };
> 
> What do you think?

It would still not work for policies. For every policy X we
can get N+1 different matches with separate struct flowi contents.
It's not possible to put a single struct flowi or any other of
the flow details into xfrm_policy. It's an N-to-1 mapping, not
a 1-to-1 mapping.


^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Herbert Xu @ 2010-04-04 11:00 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev
In-Reply-To: <4BB86EE8.5090203@iki.fi>

On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>
> Because flow_cache_entry is per-cpu, and multiple entries (due to
> different flows matching same policies, or same flow having multiple
> per-cpu entries) can point to same policy. If we cached "dummy" objects
> for even policies, then this would be better approach.

Oh yes of course.

But what we could do is embed most of flow_cache_entry into
xfrm_policy (and xdst in your later patches) along with the
ops pointer.

Like this:

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flowi		key;
	struct flow_cache_ops **ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

What do you think?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teräs @ 2010-04-04 10:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev
In-Reply-To: <20100404104230.GA10368@gondor.apana.org.au>

Herbert Xu wrote:
> On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
>> -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
>> -			       u8 dir, flow_resolve_t resolver);
>> +struct flow_cache_entry_ops {
>> +	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
>> +	int (*check)(struct flow_cache_entry_ops **);
>> +	void (*delete)(struct flow_cache_entry_ops **);
>> +};
>> +
>> +typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
>> +		struct net *net, struct flowi *key, u16 family,
>> +		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
> 
> OK this bit really bugs me.
> 
> When I first looked at it, my reaction was why on earth are we
> returning an ops pointer? Only after some digging around do I see
> the fact that this ops pointer is in fact embedded in xfrm_policy.
> 
> How about embedding flow_cache_entry in xfrm_policy instead? Returning
> flow_cache_entry * would make a lot more sense than a nested pointer
> to flow_cache_entry_ops.

Because flow_cache_entry is per-cpu, and multiple entries (due to
different flows matching the same policies, or the same flow having
multiple per-cpu entries) can point to the same policy. If we cached
"dummy" objects even for policies, then this would be a better approach.

This would actually make sense, since it'd be useful to cache all
policies involved in the check path (main + sub policy refs). In which
case we might want to make the ops 'per flow cache instance' instead
of 'per cache entry'.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox