* [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-29 22:55 [PATCH 0/9] I/OAT Chris Leech
@ 2006-03-29 22:55 ` Chris Leech
2006-03-30 8:01 ` Kumar Gala
0 siblings, 1 reply; 30+ messages in thread
From: Chris Leech @ 2006-03-29 22:55 UTC (permalink / raw)
To: linux-kernel, netdev
Provides an API for offloading memory copies to DMA devices
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/Kconfig | 2
drivers/Makefile | 1
drivers/dma/Kconfig | 13 +
drivers/dma/Makefile | 1
drivers/dma/dmaengine.c | 405 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/dmaengine.h | 337 +++++++++++++++++++++++++++++++++++++
6 files changed, 759 insertions(+), 0 deletions(-)
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 9f5c0da..f89ac05 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
source "drivers/rtc/Kconfig"
+source "drivers/dma/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 4249552..9b808a6 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/
obj-y += firmware/
obj-$(CONFIG_CRYPTO) += crypto/
obj-$(CONFIG_SUPERH) += sh/
+obj-$(CONFIG_DMA_ENGINE) += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 0000000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+ bool "Support for DMA engines"
+ ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 0000000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 0000000..683456a
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a spinlock, dma_list_lock.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered. When the
+ * class_device is released, the coresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs. When the channel is freed, the coresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t. A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event. Net DMA client takes an
+ * extra reference per outstanding transaction. The relase function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+static DEFINE_SPINLOCK(dma_list_lock);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+ return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+ __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+ __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+ __ATTR(in_use, S_IRUGO, show_in_use, NULL),
+ __ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+ .name = "dma",
+ .class_dev_attrs = dma_class_attrs,
+ .release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_lock held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+ struct dma_device *device;
+ struct dma_chan *chan;
+ unsigned long flags;
+ int desc; /* allocated descriptor count */
+
+ /* Find a channel, any DMA engine will do */
+ list_for_each_entry(device, &dma_device_list, global_node) {
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client)
+ continue;
+
+ desc = chan->device->device_alloc_chan_resources(chan);
+ if (desc >= 0) {
+ kref_get(&device->refcount);
+ kref_init(&chan->refcount);
+ chan->slow_ref = 0;
+ INIT_RCU_HEAD(&chan->rcu);
+ chan->client = client;
+ spin_lock_irqsave(&client->lock, flags);
+ list_add_tail_rcu(&chan->client_node,
+ &client->channels);
+ spin_unlock_irqrestore(&client->lock, flags);
+ return chan;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * dma_client_chan_free - release a DMA channel
+ * @chan: &dma_chan
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+ struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+ chan->device->device_free_chan_resources(chan);
+ chan->client = NULL;
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+ struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+ int bias = 0x7FFFFFFF;
+ int i;
+ for_each_cpu(i)
+ bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+ atomic_sub(bias, &chan->refcount.refcount);
+ kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+ atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+ chan->slow_ref = 1;
+ call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channel in the system changes,
+ * channels need to be rebalanced among clients
+ */
+static void dma_chans_rebalance(void)
+{
+ struct dma_client *client;
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ spin_lock(&dma_list_lock);
+ list_for_each_entry(client, &dma_client_list, global_node) {
+ while (client->chans_desired > client->chan_count) {
+ chan = dma_client_chan_alloc(client);
+ if (!chan)
+ break;
+ client->chan_count++;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_ADDED);
+ }
+ while (client->chans_desired < client->chan_count) {
+ spin_lock_irqsave(&client->lock, flags);
+ chan = list_entry(client->channels.next,
+ struct dma_chan,
+ client_node);
+ list_del_rcu(&chan->client_node);
+ spin_unlock_irqrestore(&client->lock, flags);
+ client->chan_count--;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ }
+ spin_unlock(&dma_list_lock);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+ struct dma_client *client;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return NULL;
+
+ INIT_LIST_HEAD(&client->channels);
+ spin_lock_init(&client->lock);
+ client->chans_desired = 0;
+ client->chan_count = 0;
+ client->event_callback = event_callback;
+
+ spin_lock(&dma_list_lock);
+ list_add_tail(&client->global_node, &dma_client_list);
+ spin_unlock(&dma_list_lock);
+
+ return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client:
+ *
+ * Force frees any allocated DMA channels, frees the &dma_client memory
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+ struct dma_chan *chan;
+
+ if (!client)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chan, &client->channels, client_node)
+ dma_client_chan_free(chan);
+ rcu_read_unlock();
+
+ spin_lock(&dma_list_lock);
+ list_del(&client->global_node);
+ spin_unlock(&dma_list_lock);
+
+ kfree(client);
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need, 0 to free all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number)
+{
+ client->chans_desired = number;
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register -
+ * @device: &dma_device
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+ static int id;
+ int chancnt = 0;
+ struct dma_chan* chan;
+
+ if (!device)
+ return -ENODEV;
+
+ init_completion(&device->done);
+ kref_init(&device->refcount);
+ device->dev_id = id++;
+
+ /* represent channels in sysfs. Probably want devs too */
+ list_for_each_entry(chan, &device->channels, device_node) {
+ chan->local = alloc_percpu(typeof(*chan->local));
+ if (chan->local == NULL)
+ continue;
+
+ chan->chan_id = chancnt++;
+ chan->class_dev.class = &dma_devclass;
+ chan->class_dev.dev = NULL;
+ snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+ device->dev_id, chan->chan_id);
+
+ kref_get(&device->refcount);
+ class_device_register(&chan->class_dev);
+ }
+
+ spin_lock(&dma_list_lock);
+ list_add_tail(&device->global_node, &dma_device_list);
+ spin_unlock(&dma_list_lock);
+
+ dma_chans_rebalance();
+
+ return 0;
+}
+
+/**
+ * dma_async_device_unregister -
+ * @device: &dma_device
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+ struct dma_device *device;
+
+ device = container_of(kref, struct dma_device, refcount);
+ complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ spin_lock(&dma_list_lock);
+ list_del(&device->global_node);
+ spin_unlock(&dma_list_lock);
+
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client) {
+ spin_lock_irqsave(&chan->client->lock, flags);
+ list_del(&chan->client_node);
+ chan->client->chan_count--;
+ spin_unlock_irqrestore(&chan->client->lock, flags);
+ chan->client->event_callback(chan->client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ class_device_unregister(&chan->class_dev);
+ }
+ dma_chans_rebalance();
+
+ kref_put(&device->refcount, dma_async_device_cleanup);
+ wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+ spin_lock_init(&dma_list_lock);
+ return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
+EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 0000000..3078154
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#include <linux/config.h>
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power managment events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+ DMA_RESOURCE_SUSPEND,
+ DMA_RESOURCE_RESUME,
+ DMA_RESOURCE_ADDED,
+ DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+ DMA_SUCCESS,
+ DMA_IN_PROGRESS,
+ DMA_ERROR,
+};
+
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @refcount: local_t used for open-coded "bigref" counting
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
+
+struct dma_chan_percpu {
+ local_t refcount;
+ /* stats */
+ unsigned long memcpy_count;
+ unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id:
+ * @class_dev:
+ * @refcount: kref, used in "bigref" slow-mode
+ * @slow_ref:
+ * @rcu:
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ * @local: per-cpu pointer to a struct dma_chan_percpu
+ */
+struct dma_chan {
+ struct dma_client *client;
+ struct dma_device *device;
+ dma_cookie_t cookie;
+
+ /* sysfs */
+ int chan_id;
+ struct class_device class_dev;
+
+ struct kref refcount;
+ int slow_ref;
+ struct rcu_head rcu;
+
+ struct list_head client_node;
+ struct list_head device_node;
+ struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_get(&chan->refcount);
+ else {
+ local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_put(&chan->refcount, dma_chan_cleanup);
+ else {
+ local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+ struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+ dma_event_callback event_callback;
+ unsigned int chan_count;
+ unsigned int chans_desired;
+
+ spinlock_t lock;
+ struct list_head channels;
+ struct list_head global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @refcount:
+ * @done:
+ * @dev_id:
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+ unsigned int chancnt;
+ struct list_head channels;
+ struct list_head global_node;
+
+ struct kref refcount;
+ struct completion done;
+
+ int dev_id;
+
+ int (*device_alloc_chan_resources)(struct dma_chan *chan);
+ void (*device_free_chan_resources)(struct dma_chan *chan);
+ dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+ void *dest, void *src, size_t len);
+ dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata,
+ size_t len);
+ dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off,
+ struct page *src_pg, unsigned int src_off, size_t len);
+ enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last,
+ dma_cookie_t *used);
+ void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+ void *dest, void *src, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+ kdata, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+ unsigned int src_off, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+ src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan:
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+ return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+ return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last know completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used in dma_async_memcpy_complete()
+ * the test logic is seperated for lightweight testing of multiple cookies
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+ dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+ if (last_complete <= last_used) {
+ if ((cookie <= last_complete) || (cookie > last_used))
+ return DMA_SUCCESS;
+ } else {
+ if ((cookie <= last_complete) && (cookie > last_used))
+ return DMA_SUCCESS;
+ }
+ return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-29 22:55 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-03-30 8:01 ` Kumar Gala
2006-03-30 18:36 ` Andrew Grover
0 siblings, 1 reply; 30+ messages in thread
From: Kumar Gala @ 2006-03-30 8:01 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
On Mar 29, 2006, at 4:55 PM, Chris Leech wrote:
> Provides an API for offloading memory copies to DMA devices
>
> Signed-off-by: Chris Leech <christopher.leech@intel.com>
> ---
>
> drivers/Kconfig | 2
> drivers/Makefile | 1
> drivers/dma/Kconfig | 13 +
> drivers/dma/Makefile | 1
> drivers/dma/dmaengine.c | 405 ++++++++++++++++++++++++++++++++++
> +++++++++++
> include/linux/dmaengine.h | 337 ++++++++++++++++++++++++++++++++++
> +++
> 6 files changed, 759 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index 9f5c0da..f89ac05 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
>
> source "drivers/rtc/Kconfig"
>
> +source "drivers/dma/Kconfig"
> +
> endmenu
> diff --git a/drivers/Makefile b/drivers/Makefile
> index 4249552..9b808a6 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/
> obj-y += firmware/
> obj-$(CONFIG_CRYPTO) += crypto/
> obj-$(CONFIG_SUPERH) += sh/
> +obj-$(CONFIG_DMA_ENGINE) += dma/
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> new file mode 100644
> index 0000000..f9ac4bc
> --- /dev/null
> +++ b/drivers/dma/Kconfig
> @@ -0,0 +1,13 @@
> +#
> +# DMA engine configuration
> +#
> +
> +menu "DMA Engine support"
> +
> +config DMA_ENGINE
> + bool "Support for DMA engines"
> + ---help---
> + DMA engines offload copy operations from the CPU to dedicated
> + hardware, allowing the copies to happen asynchronously.
> +
> +endmenu
> diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
> new file mode 100644
> index 0000000..10b7391
> --- /dev/null
> +++ b/drivers/dma/Makefile
> @@ -0,0 +1 @@
> +obj-y += dmaengine.o
> diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
> new file mode 100644
> index 0000000..683456a
> --- /dev/null
> +++ b/drivers/dma/dmaengine.c
> @@ -0,0 +1,405 @@
> +/*
> + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> modify it
> + * under the terms of the GNU General Public License as published
> by the Free
> + * Software Foundation; either version 2 of the License, or (at
> your option)
> + * any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of
> MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
> License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public
> License along with
> + * this program; if not, write to the Free Software Foundation,
> Inc., 59
> + * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * The full GNU General Public License is included in this
> distribution in the
> + * file called COPYING.
> + */
> +
> +/*
> + * This code implements the DMA subsystem. It provides a HW-
> neutral interface
> + * for other kernel code to use asynchronous memory copy
> capabilities,
> + * if present, and allows different HW DMA drivers to register as
> providing
> + * this capability.
> + *
> + * Due to the fact we are accelerating what is already a
> relatively fast
> + * operation, the code goes to great lengths to avoid additional
> overhead,
> + * such as locking.
> + *
> + * LOCKING:
> + *
> + * The subsystem keeps two global lists, dma_device_list and
> dma_client_list.
> + * Both of these are protected by a spinlock, dma_list_lock.
> + *
> + * Each device has a channels list, which runs unlocked but is
> never modified
> + * once the device is registered, it's just setup by the driver.
> + *
> + * Each client has a channels list, it's only modified under the
> client->lock
> + * and in an RCU callback, so it's safe to read under rcu_read_lock
> ().
> + *
> + * Each device has a kref, which is initialized to 1 when the
> device is
> + * registered. A kref_put is done for each class_device
> registered. When the
> + * class_device is released, the coresponding kref_put is done in
> the release
> + * method. Every time one of the device's channels is allocated to
> a client,
> + * a kref_get occurs. When the channel is freed, the coresponding
> kref_put
> + * happens. The device's release function does a completion, so
> + * unregister_device does a remove event, class_device_unregister,
> a kref_put
> + * for the first reference, then waits on the completion for all
> other
> + * references to finish.
> + *
> + * Each channel has an open-coded implementation of Rusty
> Russell's "bigref,"
> + * with a kref and a per_cpu local_t. A single reference is set
> when on an
> + * ADDED event, and removed with a REMOVE event. Net DMA client
> takes an
> + * extra reference per outstanding transaction. The relase
> function does a
> + * kref_put on the device. -ChrisL
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/dmaengine.h>
> +#include <linux/hardirq.h>
> +#include <linux/spinlock.h>
> +#include <linux/percpu.h>
> +#include <linux/rcupdate.h>
> +
> +static DEFINE_SPINLOCK(dma_list_lock);
> +static LIST_HEAD(dma_device_list);
> +static LIST_HEAD(dma_client_list);
> +
> +/* --- sysfs implementation --- */
> +
> +static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
> +{
> + struct dma_chan *chan = container_of(cd, struct dma_chan,
> class_dev);
> + unsigned long count = 0;
> + int i;
> +
> + for_each_cpu(i)
> + count += per_cpu_ptr(chan->local, i)->memcpy_count;
> +
> + return sprintf(buf, "%lu\n", count);
> +}
> +
> +static ssize_t show_bytes_transferred(struct class_device *cd,
> char *buf)
> +{
> + struct dma_chan *chan = container_of(cd, struct dma_chan,
> class_dev);
> + unsigned long count = 0;
> + int i;
> +
> + for_each_cpu(i)
> + count += per_cpu_ptr(chan->local, i)->bytes_transferred;
> +
> + return sprintf(buf, "%lu\n", count);
> +}
> +
What is the utility of exporting memcpy_count, and bytes_transferred
to userspace via sysfs? Is this really for debug (and thus should be
under debugfs?)
> +static ssize_t show_in_use(struct class_device *cd, char *buf)
> +{
> + struct dma_chan *chan = container_of(cd, struct dma_chan,
> class_dev);
> +
> + return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
> +}
> +
> +static struct class_device_attribute dma_class_attrs[] = {
> + __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
> + __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
> + __ATTR(in_use, S_IRUGO, show_in_use, NULL),
> + __ATTR_NULL
> +};
> +
[snip]
- kumar
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-30 8:01 ` Kumar Gala
@ 2006-03-30 18:36 ` Andrew Grover
2006-03-30 19:57 ` Kumar Gala
0 siblings, 1 reply; 30+ messages in thread
From: Andrew Grover @ 2006-03-30 18:36 UTC (permalink / raw)
To: Kumar Gala; +Cc: Chris Leech, linux-kernel, netdev
On 3/30/06, Kumar Gala <galak@kernel.crashing.org> wrote:
> What is the utility of exporting memcpy_count, and bytes_transferred
> to userspace via sysfs? Is this really for debug (and thus should be
> under debugfs?)
Well....it's true they're useful for debugging but I would put them in
the category of system statistics that shouldn't go in debugfs. I
think they are like /proc/interrupts' interrupt counts or the TX/RX
stats reported by ifconfig.
Regards -- Andy
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-30 18:36 ` Andrew Grover
@ 2006-03-30 19:57 ` Kumar Gala
2006-03-31 8:26 ` Ingo Oeser
0 siblings, 1 reply; 30+ messages in thread
From: Kumar Gala @ 2006-03-30 19:57 UTC (permalink / raw)
To: Andrew Grover; +Cc: Chris Leech, linux-kernel, netdev
On Mar 30, 2006, at 12:36 PM, Andrew Grover wrote:
> On 3/30/06, Kumar Gala <galak@kernel.crashing.org> wrote:
>> What is the utility of exporting memcpy_count, and bytes_transferred
>> to userspace via sysfs? Is this really for debug (and thus should be
>> under debugfs?)
>
> Well....it's true they're useful for debugging but I would put them in
> the category of system statistics that shouldn't go in debugfs. I
> think they are like /proc/interrupts' interrupt counts or the TX/RX
> stats reported by ifconfig.
Fair, but wouldn't it be better to have the association per client.
Maybe leave the one as a summary and have a dir per client with
similar stats that are for each client and add a per channel summary
at the top level as well.
- kumar
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-30 19:57 ` Kumar Gala
@ 2006-03-31 8:26 ` Ingo Oeser
2006-03-31 20:04 ` Andrew Grover
0 siblings, 1 reply; 30+ messages in thread
From: Ingo Oeser @ 2006-03-31 8:26 UTC (permalink / raw)
To: Kumar Gala; +Cc: Andrew Grover, Chris Leech, linux-kernel, netdev
Kumar Gala wrote:
> On Mar 30, 2006, at 12:36 PM, Andrew Grover wrote:
> > Well....it's true they're useful for debugging but I would put them in
> > the category of system statistics that shouldn't go in debugfs. I
> > think they are like /proc/interrupts' interrupt counts or the TX/RX
> > stats reported by ifconfig.
>
> Fair, but wouldn't it be better to have the association per client.
>
> Maybe leave the one as a summary and have a dir per client with
> similar stats that are for each client and add a per channel summary
> at the top level as well.
Such level of detail really belongs to debugging, IMHO.
I think, it would suffer to say, how many channels are in use.
So you can answer the question, whether your customers are actually
using this experimental technology.
If you want more, let them mount debugfs.
If it becomes really important, we can revisit this later.
Thats the advantage of files under debugfs
not being stable API in any way.
BTW: What is the actual frequency, at which such counters
will be incremented?
Regards
Ingo Oeser
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-31 8:26 ` Ingo Oeser
@ 2006-03-31 20:04 ` Andrew Grover
2006-03-31 20:06 ` Kumar Gala
0 siblings, 1 reply; 30+ messages in thread
From: Andrew Grover @ 2006-03-31 20:04 UTC (permalink / raw)
To: Ingo Oeser; +Cc: Kumar Gala, Chris Leech, linux-kernel, netdev
On 3/31/06, Ingo Oeser <netdev@axxeo.de> wrote:
> Kumar Gala wrote:
> > Fair, but wouldn't it be better to have the association per client.
> >
> > Maybe leave the one as a summary and have a dir per client with
> > similar stats that are for each client and add a per channel summary
> > at the top level as well.
> Such level of detail really belongs to debugging, IMHO.
[snip]
If we implemented more stats then yes debugfs sounds like it might be
the way to go.
> BTW: What is the actual frequency, at which such counters
> will be incremented?
Currently the code updates these variables (kept per cpu) every time a
copy is queued. See include/linux/dmaengine.h.
-- Andy
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-31 20:04 ` Andrew Grover
@ 2006-03-31 20:06 ` Kumar Gala
2006-03-31 20:27 ` Andrew Grover
0 siblings, 1 reply; 30+ messages in thread
From: Kumar Gala @ 2006-03-31 20:06 UTC (permalink / raw)
To: Andrew Grover; +Cc: Ingo Oeser, Chris Leech, linux-kernel, netdev
On Mar 31, 2006, at 2:04 PM, Andrew Grover wrote:
> On 3/31/06, Ingo Oeser <netdev@axxeo.de> wrote:
>> Kumar Gala wrote:
>>> Fair, but wouldn't it be better to have the association per client.
>>>
>>> Maybe leave the one as a summary and have a dir per client with
>>> similar stats that are for each client and add a per channel summary
>>> at the top level as well.
>> Such level of detail really belongs to debugging, IMHO.
> [snip]
>
> If we implemented more stats then yes debugfs sounds like it might be
> the way to go.
>
>> BTW: What is the actual frequency, at which such counters
>> will be incremented?
>
> Currently the code updates these variables (kept per cpu) every time a
> copy is queued. See include/linux/dmaengine.h.
Might it be better to update when the transfer is done incase of an
error?
- kumar
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-03-31 20:06 ` Kumar Gala
@ 2006-03-31 20:27 ` Andrew Grover
0 siblings, 0 replies; 30+ messages in thread
From: Andrew Grover @ 2006-03-31 20:27 UTC (permalink / raw)
To: Kumar Gala; +Cc: Ingo Oeser, Chris Leech, linux-kernel, netdev
On 3/31/06, Kumar Gala <galak@kernel.crashing.org> wrote:
> > Currently the code updates these variables (kept per cpu) every time a
> > copy is queued. See include/linux/dmaengine.h.
>
> Might it be better to update when the transfer is done incase of an
> error?
The queueing function is really in the best position to do this. It
knows the size of each request. However in the cleanup/status check
routine, all we know is the last request completed -- we don't know
the completed requests' sizes.
The other reason is that the DMA engine should never throw an error.
If it does then something is very wrong, we print scary warnings, and
up-to-date stats are the least of our problems.
Regards -- Andy
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-08 22:16 [PATCH 0/9] I/OAT network recv copy offload Chris Leech
@ 2006-05-08 22:17 ` Chris Leech
0 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-08 22:17 UTC (permalink / raw)
To: linux-kernel, netdev
Provides an API for offloading memory copies to DMA devices
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/Kconfig | 2
drivers/Makefile | 1
drivers/dma/Kconfig | 13 +
drivers/dma/Makefile | 1
drivers/dma/dmaengine.c | 408 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/dmaengine.h | 337 +++++++++++++++++++++++++++++++++++++
6 files changed, 762 insertions(+), 0 deletions(-)
diff --git a/drivers/Kconfig b/drivers/Kconfig
index aeb5ab2..8b11ceb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
source "drivers/rtc/Kconfig"
+source "drivers/dma/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 447d8e6..3c51703 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/
obj-y += firmware/
obj-$(CONFIG_CRYPTO) += crypto/
obj-$(CONFIG_SUPERH) += sh/
+obj-$(CONFIG_DMA_ENGINE) += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 0000000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+ bool "Support for DMA engines"
+ ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 0000000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 0000000..473c47b
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered. When the
+ * class_device is released, the coresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs. When the channel is freed, the coresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t. A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event. Net DMA client takes an
+ * extra reference per outstanding transaction. The relase function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+ return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+ __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+ __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+ __ATTR(in_use, S_IRUGO, show_in_use, NULL),
+ __ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+ .name = "dma",
+ .class_dev_attrs = dma_class_attrs,
+ .release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_mutex held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+ struct dma_device *device;
+ struct dma_chan *chan;
+ unsigned long flags;
+ int desc; /* allocated descriptor count */
+
+ /* Find a channel, any DMA engine will do */
+ list_for_each_entry(device, &dma_device_list, global_node) {
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client)
+ continue;
+
+ desc = chan->device->device_alloc_chan_resources(chan);
+ if (desc >= 0) {
+ kref_get(&device->refcount);
+ kref_init(&chan->refcount);
+ chan->slow_ref = 0;
+ INIT_RCU_HEAD(&chan->rcu);
+ chan->client = client;
+ spin_lock_irqsave(&client->lock, flags);
+ list_add_tail_rcu(&chan->client_node,
+ &client->channels);
+ spin_unlock_irqrestore(&client->lock, flags);
+ return chan;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * dma_client_chan_free - release a DMA channel
+ * @chan: &dma_chan
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+ struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+ chan->device->device_free_chan_resources(chan);
+ chan->client = NULL;
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+ struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+ int bias = 0x7FFFFFFF;
+ int i;
+ for_each_cpu(i)
+ bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+ atomic_sub(bias, &chan->refcount.refcount);
+ kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+ atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+ chan->slow_ref = 1;
+ call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channel in the system changes,
+ * channels need to be rebalanced among clients
+ */
+static void dma_chans_rebalance(void)
+{
+ struct dma_client *client;
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ mutex_lock(&dma_list_mutex);
+
+ list_for_each_entry(client, &dma_client_list, global_node) {
+ while (client->chans_desired > client->chan_count) {
+ chan = dma_client_chan_alloc(client);
+ if (!chan)
+ break;
+ client->chan_count++;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_ADDED);
+ }
+ while (client->chans_desired < client->chan_count) {
+ spin_lock_irqsave(&client->lock, flags);
+ chan = list_entry(client->channels.next,
+ struct dma_chan,
+ client_node);
+ list_del_rcu(&chan->client_node);
+ spin_unlock_irqrestore(&client->lock, flags);
+ client->chan_count--;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ }
+
+ mutex_unlock(&dma_list_mutex);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+ struct dma_client *client;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return NULL;
+
+ INIT_LIST_HEAD(&client->channels);
+ spin_lock_init(&client->lock);
+ client->chans_desired = 0;
+ client->chan_count = 0;
+ client->event_callback = event_callback;
+
+ mutex_lock(&dma_list_mutex);
+ list_add_tail(&client->global_node, &dma_client_list);
+ mutex_unlock(&dma_list_mutex);
+
+ return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client:
+ *
+ * Force frees any allocated DMA channels, frees the &dma_client memory
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+ struct dma_chan *chan;
+
+ if (!client)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chan, &client->channels, client_node)
+ dma_client_chan_free(chan);
+ rcu_read_unlock();
+
+ mutex_lock(&dma_list_mutex);
+ list_del(&client->global_node);
+ mutex_unlock(&dma_list_mutex);
+
+ kfree(client);
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need, 0 to free all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number)
+{
+ client->chans_desired = number;
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register -
+ * @device: &dma_device
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+ static int id;
+ int chancnt = 0;
+ struct dma_chan* chan;
+
+ if (!device)
+ return -ENODEV;
+
+ init_completion(&device->done);
+ kref_init(&device->refcount);
+ device->dev_id = id++;
+
+ /* represent channels in sysfs. Probably want devs too */
+ list_for_each_entry(chan, &device->channels, device_node) {
+ chan->local = alloc_percpu(typeof(*chan->local));
+ if (chan->local == NULL)
+ continue;
+
+ chan->chan_id = chancnt++;
+ chan->class_dev.class = &dma_devclass;
+ chan->class_dev.dev = NULL;
+ snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+ device->dev_id, chan->chan_id);
+
+ kref_get(&device->refcount);
+ class_device_register(&chan->class_dev);
+ }
+
+ mutex_lock(&dma_list_mutex);
+ list_add_tail(&device->global_node, &dma_device_list);
+ mutex_unlock(&dma_list_mutex);
+
+ dma_chans_rebalance();
+
+ return 0;
+}
+
+/**
+ * dma_async_device_unregister -
+ * @device: &dma_device
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+ struct dma_device *device;
+
+ device = container_of(kref, struct dma_device, refcount);
+ complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ mutex_lock(&dma_list_mutex);
+ list_del(&device->global_node);
+ mutex_unlock(&dma_list_mutex);
+
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client) {
+ spin_lock_irqsave(&chan->client->lock, flags);
+ list_del(&chan->client_node);
+ chan->client->chan_count--;
+ spin_unlock_irqrestore(&chan->client->lock, flags);
+ chan->client->event_callback(chan->client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ class_device_unregister(&chan->class_dev);
+ }
+ dma_chans_rebalance();
+
+ kref_put(&device->refcount, dma_async_device_cleanup);
+ wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+ mutex_init(&dma_list_mutex);
+ return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
+EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 0000000..3078154
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#include <linux/config.h>
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power managment events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+ DMA_RESOURCE_SUSPEND,
+ DMA_RESOURCE_RESUME,
+ DMA_RESOURCE_ADDED,
+ DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+ DMA_SUCCESS,
+ DMA_IN_PROGRESS,
+ DMA_ERROR,
+};
+
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @refcount: local_t used for open-coded "bigref" counting
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
+
+struct dma_chan_percpu {
+ local_t refcount;
+ /* stats */
+ unsigned long memcpy_count;
+ unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id:
+ * @class_dev:
+ * @refcount: kref, used in "bigref" slow-mode
+ * @slow_ref:
+ * @rcu:
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ * @local: per-cpu pointer to a struct dma_chan_percpu
+ */
+struct dma_chan {
+ struct dma_client *client;
+ struct dma_device *device;
+ dma_cookie_t cookie;
+
+ /* sysfs */
+ int chan_id;
+ struct class_device class_dev;
+
+ struct kref refcount;
+ int slow_ref;
+ struct rcu_head rcu;
+
+ struct list_head client_node;
+ struct list_head device_node;
+ struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_get(&chan->refcount);
+ else {
+ local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_put(&chan->refcount, dma_chan_cleanup);
+ else {
+ local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+ struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+ dma_event_callback event_callback;
+ unsigned int chan_count;
+ unsigned int chans_desired;
+
+ spinlock_t lock;
+ struct list_head channels;
+ struct list_head global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @refcount:
+ * @done:
+ * @dev_id:
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+ unsigned int chancnt;
+ struct list_head channels;
+ struct list_head global_node;
+
+ struct kref refcount;
+ struct completion done;
+
+ int dev_id;
+
+ int (*device_alloc_chan_resources)(struct dma_chan *chan);
+ void (*device_free_chan_resources)(struct dma_chan *chan);
+ dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+ void *dest, void *src, size_t len);
+ dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata,
+ size_t len);
+ dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off,
+ struct page *src_pg, unsigned int src_off, size_t len);
+ enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last,
+ dma_cookie_t *used);
+ void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+ void *dest, void *src, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+ kdata, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+ unsigned int src_off, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+ src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan:
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+ return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+ return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last know completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used in dma_async_memcpy_complete()
+ * the test logic is seperated for lightweight testing of multiple cookies
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+ dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+ if (last_complete <= last_used) {
+ if ((cookie <= last_complete) || (cookie > last_used))
+ return DMA_SUCCESS;
+ } else {
+ if ((cookie <= last_complete) && (cookie > last_used))
+ return DMA_SUCCESS;
+ }
+ return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 0/9] I/OAT repost
@ 2006-05-24 0:16 Chris Leech
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
` (9 more replies)
0 siblings, 10 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:16 UTC (permalink / raw)
To: linux-kernel, netdev
This is a repost of the I/OAT patches, the only changes from last time
are refreshing the patches and removing an unused macro that was causing
the vger spam filters to drop patch 2/9.
This patch series is the a full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux. It includes an in kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.
These changes apply to Linus' tree as of commit
387e2b0439026aa738a9edca15a57e5c0bcb4dfc
[BRIDGE]: need to ref count the LLC sap
They are available to pull from
git://63.64.152.142/~cleech/linux-2.6 ioat-2.6.18
There are 9 patches in the series:
1) The memcpy offload APIs and class code
2) The Intel I/OAT DMA driver (ioatdma)
3) Core networking code to setup networking as a DMA memcpy client
4) Utility functions for sk_buff to iovec offloaded copy
5) Structure changes needed for TCP receive offload
6) Rename cleanup_rbuf to tcp_cleanup_rbuf
7) Make sk_eat_skb aware of early copied packets
8) Add a sysctl to tune the minimum offloaded I/O size for TCP
9) The main TCP receive offload changes
--
Chris Leech <christopher.leech@intel.com>
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:48 ` Andrew Morton
` (2 more replies)
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
` (8 subsequent siblings)
9 siblings, 3 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Provides an API for offloading memory copies to DMA devices
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/Kconfig | 2
drivers/Makefile | 1
drivers/dma/Kconfig | 13 +
drivers/dma/Makefile | 1
drivers/dma/dmaengine.c | 408 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/dmaengine.h | 337 +++++++++++++++++++++++++++++++++++++
6 files changed, 762 insertions(+), 0 deletions(-)
diff --git a/drivers/Kconfig b/drivers/Kconfig
index aeb5ab2..8b11ceb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
source "drivers/rtc/Kconfig"
+source "drivers/dma/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 447d8e6..3c51703 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/
obj-y += firmware/
obj-$(CONFIG_CRYPTO) += crypto/
obj-$(CONFIG_SUPERH) += sh/
+obj-$(CONFIG_DMA_ENGINE) += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 0000000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+ bool "Support for DMA engines"
+ ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 0000000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 0000000..473c47b
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered. When the
+ * class_device is released, the coresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs. When the channel is freed, the coresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t. A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event. Net DMA client takes an
+ * extra reference per outstanding transaction. The relase function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ unsigned long count = 0;
+ int i;
+
+ for_each_cpu(i)
+ count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+ return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+ return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+ __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+ __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+ __ATTR(in_use, S_IRUGO, show_in_use, NULL),
+ __ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+ struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+ .name = "dma",
+ .class_dev_attrs = dma_class_attrs,
+ .release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_mutex held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+ struct dma_device *device;
+ struct dma_chan *chan;
+ unsigned long flags;
+ int desc; /* allocated descriptor count */
+
+ /* Find a channel, any DMA engine will do */
+ list_for_each_entry(device, &dma_device_list, global_node) {
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client)
+ continue;
+
+ desc = chan->device->device_alloc_chan_resources(chan);
+ if (desc >= 0) {
+ kref_get(&device->refcount);
+ kref_init(&chan->refcount);
+ chan->slow_ref = 0;
+ INIT_RCU_HEAD(&chan->rcu);
+ chan->client = client;
+ spin_lock_irqsave(&client->lock, flags);
+ list_add_tail_rcu(&chan->client_node,
+ &client->channels);
+ spin_unlock_irqrestore(&client->lock, flags);
+ return chan;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * dma_client_chan_free - release a DMA channel
+ * @chan: &dma_chan
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+ struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+ chan->device->device_free_chan_resources(chan);
+ chan->client = NULL;
+ kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+ struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+ int bias = 0x7FFFFFFF;
+ int i;
+ for_each_cpu(i)
+ bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+ atomic_sub(bias, &chan->refcount.refcount);
+ kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+ atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+ chan->slow_ref = 1;
+ call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channel in the system changes,
+ * channels need to be rebalanced among clients
+ */
+static void dma_chans_rebalance(void)
+{
+ struct dma_client *client;
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ mutex_lock(&dma_list_mutex);
+
+ list_for_each_entry(client, &dma_client_list, global_node) {
+ while (client->chans_desired > client->chan_count) {
+ chan = dma_client_chan_alloc(client);
+ if (!chan)
+ break;
+ client->chan_count++;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_ADDED);
+ }
+ while (client->chans_desired < client->chan_count) {
+ spin_lock_irqsave(&client->lock, flags);
+ chan = list_entry(client->channels.next,
+ struct dma_chan,
+ client_node);
+ list_del_rcu(&chan->client_node);
+ spin_unlock_irqrestore(&client->lock, flags);
+ client->chan_count--;
+ client->event_callback(client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ }
+
+ mutex_unlock(&dma_list_mutex);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+ struct dma_client *client;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return NULL;
+
+ INIT_LIST_HEAD(&client->channels);
+ spin_lock_init(&client->lock);
+ client->chans_desired = 0;
+ client->chan_count = 0;
+ client->event_callback = event_callback;
+
+ mutex_lock(&dma_list_mutex);
+ list_add_tail(&client->global_node, &dma_client_list);
+ mutex_unlock(&dma_list_mutex);
+
+ return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client:
+ *
+ * Force frees any allocated DMA channels, frees the &dma_client memory
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+ struct dma_chan *chan;
+
+ if (!client)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chan, &client->channels, client_node)
+ dma_client_chan_free(chan);
+ rcu_read_unlock();
+
+ mutex_lock(&dma_list_mutex);
+ list_del(&client->global_node);
+ mutex_unlock(&dma_list_mutex);
+
+ kfree(client);
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need, 0 to free all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number)
+{
+ client->chans_desired = number;
+ dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register -
+ * @device: &dma_device
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+ static int id;
+ int chancnt = 0;
+ struct dma_chan* chan;
+
+ if (!device)
+ return -ENODEV;
+
+ init_completion(&device->done);
+ kref_init(&device->refcount);
+ device->dev_id = id++;
+
+ /* represent channels in sysfs. Probably want devs too */
+ list_for_each_entry(chan, &device->channels, device_node) {
+ chan->local = alloc_percpu(typeof(*chan->local));
+ if (chan->local == NULL)
+ continue;
+
+ chan->chan_id = chancnt++;
+ chan->class_dev.class = &dma_devclass;
+ chan->class_dev.dev = NULL;
+ snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+ device->dev_id, chan->chan_id);
+
+ kref_get(&device->refcount);
+ class_device_register(&chan->class_dev);
+ }
+
+ mutex_lock(&dma_list_mutex);
+ list_add_tail(&device->global_node, &dma_device_list);
+ mutex_unlock(&dma_list_mutex);
+
+ dma_chans_rebalance();
+
+ return 0;
+}
+
+/**
+ * dma_async_device_unregister -
+ * @device: &dma_device
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+ struct dma_device *device;
+
+ device = container_of(kref, struct dma_device, refcount);
+ complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+ struct dma_chan *chan;
+ unsigned long flags;
+
+ mutex_lock(&dma_list_mutex);
+ list_del(&device->global_node);
+ mutex_unlock(&dma_list_mutex);
+
+ list_for_each_entry(chan, &device->channels, device_node) {
+ if (chan->client) {
+ spin_lock_irqsave(&chan->client->lock, flags);
+ list_del(&chan->client_node);
+ chan->client->chan_count--;
+ spin_unlock_irqrestore(&chan->client->lock, flags);
+ chan->client->event_callback(chan->client,
+ chan,
+ DMA_RESOURCE_REMOVED);
+ dma_client_chan_free(chan);
+ }
+ class_device_unregister(&chan->class_dev);
+ }
+ dma_chans_rebalance();
+
+ kref_put(&device->refcount, dma_async_device_cleanup);
+ wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+ mutex_init(&dma_list_mutex);
+ return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
+EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 0000000..3078154
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#include <linux/config.h>
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power managment events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+ DMA_RESOURCE_SUSPEND,
+ DMA_RESOURCE_RESUME,
+ DMA_RESOURCE_ADDED,
+ DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+ DMA_SUCCESS,
+ DMA_IN_PROGRESS,
+ DMA_ERROR,
+};
+
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @refcount: local_t used for open-coded "bigref" counting
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
+
+struct dma_chan_percpu {
+ local_t refcount;
+ /* stats */
+ unsigned long memcpy_count;
+ unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id:
+ * @class_dev:
+ * @refcount: kref, used in "bigref" slow-mode
+ * @slow_ref:
+ * @rcu:
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ * @local: per-cpu pointer to a struct dma_chan_percpu
+ */
+struct dma_chan {
+ struct dma_client *client;
+ struct dma_device *device;
+ dma_cookie_t cookie;
+
+ /* sysfs */
+ int chan_id;
+ struct class_device class_dev;
+
+ struct kref refcount;
+ int slow_ref;
+ struct rcu_head rcu;
+
+ struct list_head client_node;
+ struct list_head device_node;
+ struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_get(&chan->refcount);
+ else {
+ local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+ if (unlikely(chan->slow_ref))
+ kref_put(&chan->refcount, dma_chan_cleanup);
+ else {
+ local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+ put_cpu();
+ }
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+ struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+ dma_event_callback event_callback;
+ unsigned int chan_count;
+ unsigned int chans_desired;
+
+ spinlock_t lock;
+ struct list_head channels;
+ struct list_head global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @refcount:
+ * @done:
+ * @dev_id:
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+ unsigned int chancnt;
+ struct list_head channels;
+ struct list_head global_node;
+
+ struct kref refcount;
+ struct completion done;
+
+ int dev_id;
+
+ int (*device_alloc_chan_resources)(struct dma_chan *chan);
+ void (*device_free_chan_resources)(struct dma_chan *chan);
+ dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+ void *dest, void *src, size_t len);
+ dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata,
+ size_t len);
+ dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off,
+ struct page *src_pg, unsigned int src_off, size_t len);
+ enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last,
+ dma_cookie_t *used);
+ void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+ unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+ void *dest, void *src, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+ struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+ kdata, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+ struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+ unsigned int src_off, size_t len)
+{
+ int cpu = get_cpu();
+ per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+ per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+ put_cpu();
+
+ return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+ src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan:
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+ return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+ dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+ return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last know completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used in dma_async_memcpy_complete()
+ * the test logic is seperated for lightweight testing of multiple cookies
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+ dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+ if (last_complete <= last_used) {
+ if ((cookie <= last_complete) || (cookie > last_used))
+ return DMA_SUCCESS;
+ } else {
+ if ((cookie <= last_complete) && (cookie > last_used))
+ return DMA_SUCCESS;
+ }
+ return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:56 ` Andrew Morton
` (2 more replies)
2006-05-24 0:20 ` [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
` (7 subsequent siblings)
9 siblings, 3 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Adds a new ioatdma driver
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/dma/Kconfig | 9
drivers/dma/Makefile | 1
drivers/dma/ioatdma.c | 839 +++++++++++++++++++++++++++++++++++++++
drivers/dma/ioatdma.h | 126 ++++++
drivers/dma/ioatdma_hw.h | 52 ++
drivers/dma/ioatdma_io.h | 118 +++++
drivers/dma/ioatdma_registers.h | 126 ++++++
7 files changed, 1271 insertions(+), 0 deletions(-)
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index f9ac4bc..0f15e76 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,4 +10,13 @@ config DMA_ENGINE
DMA engines offload copy operations from the CPU to dedicated
hardware, allowing the copies to happen asynchronously.
+comment "DMA Devices"
+
+config INTEL_IOATDMA
+ tristate "Intel I/OAT DMA support"
+ depends on DMA_ENGINE && PCI
+ default m
+ ---help---
+ Enable support for the Intel(R) I/OAT DMA engine.
+
endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 10b7391..c8a5f56 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1 +1,2 @@
obj-y += dmaengine.o
+obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
new file mode 100644
index 0000000..11d48b9
--- /dev/null
+++ b/drivers/dma/ioatdma.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include "ioatdma.h"
+#include "ioatdma_io.h"
+#include "ioatdma_registers.h"
+#include "ioatdma_hw.h"
+
+#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
+#define to_ioat_device(dev) container_of(dev, struct ioat_device, common)
+#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+
+/* internal functions */
+static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
+static void __devexit ioat_remove(struct pci_dev *pdev);
+
+static int enumerate_dma_channels(struct ioat_device *device)
+{
+ u8 xfercap_scale;
+ u32 xfercap;
+ int i;
+ struct ioat_dma_chan *ioat_chan;
+
+ device->common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
+ xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+ xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
+
+ for (i = 0; i < device->common.chancnt; i++) {
+ ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
+ if (!ioat_chan) {
+ device->common.chancnt = i;
+ break;
+ }
+
+ ioat_chan->device = device;
+ ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1));
+ ioat_chan->xfercap = xfercap;
+ spin_lock_init(&ioat_chan->cleanup_lock);
+ spin_lock_init(&ioat_chan->desc_lock);
+ INIT_LIST_HEAD(&ioat_chan->free_desc);
+ INIT_LIST_HEAD(&ioat_chan->used_desc);
+ /* This should be made common somewhere in dmaengine.c */
+ ioat_chan->common.device = &device->common;
+ ioat_chan->common.client = NULL;
+ list_add_tail(&ioat_chan->common.device_node,
+ &device->common.channels);
+ }
+ return device->common.chancnt;
+}
+
+static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
+ struct ioat_dma_chan *ioat_chan,
+ int flags)
+{
+ struct ioat_dma_descriptor *desc;
+ struct ioat_desc_sw *desc_sw;
+ struct ioat_device *ioat_device;
+ dma_addr_t phys;
+
+ ioat_device = to_ioat_device(ioat_chan->common.device);
+ desc = pci_pool_alloc(ioat_device->dma_pool, flags, &phys);
+ if (unlikely(!desc))
+ return NULL;
+
+ desc_sw = kzalloc(sizeof(*desc_sw), flags);
+ if (unlikely(!desc_sw)) {
+ pci_pool_free(ioat_device->dma_pool, desc, phys);
+ return NULL;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+ desc_sw->hw = desc;
+ desc_sw->phys = phys;
+
+ return desc_sw;
+}
+
+#define INITIAL_IOAT_DESC_COUNT 128
+
+static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan);
+
+/* returns the actual number of allocated descriptors */
+static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+ struct ioat_desc_sw *desc = NULL;
+ u16 chanctrl;
+ u32 chanerr;
+ int i;
+ LIST_HEAD(tmp_list);
+
+ /*
+ * In-use bit automatically set by reading chanctrl
+ * If 0, we got it, if 1, someone else did
+ */
+ chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+ if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE)
+ return -EBUSY;
+
+ /* Setup register to interrupt and write completion status on error */
+ chanctrl = IOAT_CHANCTRL_CHANNEL_IN_USE |
+ IOAT_CHANCTRL_ERR_INT_EN |
+ IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
+ IOAT_CHANCTRL_ERR_COMPLETION_EN;
+ ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+
+ chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET);
+ if (chanerr) {
+ printk("IOAT: CHANERR = %x, clearing\n", chanerr);
+ ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr);
+ }
+
+ /* Allocate descriptors */
+ for (i = 0; i < INITIAL_IOAT_DESC_COUNT; i++) {
+ desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
+ if (!desc) {
+ printk(KERN_ERR "IOAT: Only %d initial descriptors\n", i);
+ break;
+ }
+ list_add_tail(&desc->node, &tmp_list);
+ }
+ spin_lock_bh(&ioat_chan->desc_lock);
+ list_splice(&tmp_list, &ioat_chan->free_desc);
+ spin_unlock_bh(&ioat_chan->desc_lock);
+
+ /* allocate a completion writeback area */
+ /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
+ ioat_chan->completion_virt =
+ pci_pool_alloc(ioat_chan->device->completion_pool,
+ GFP_KERNEL,
+ &ioat_chan->completion_addr);
+ memset(ioat_chan->completion_virt, 0,
+ sizeof(*ioat_chan->completion_virt));
+ ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW,
+ ((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF);
+ ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH,
+ ((u64) ioat_chan->completion_addr) >> 32);
+
+ ioat_start_null_desc(ioat_chan);
+ return i;
+}
+
+static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan);
+
+static void ioat_dma_free_chan_resources(struct dma_chan *chan)
+{
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+ struct ioat_device *ioat_device = to_ioat_device(chan->device);
+ struct ioat_desc_sw *desc, *_desc;
+ u16 chanctrl;
+ int in_use_descs = 0;
+
+ ioat_dma_memcpy_cleanup(ioat_chan);
+
+ ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET);
+
+ spin_lock_bh(&ioat_chan->desc_lock);
+ list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) {
+ in_use_descs++;
+ list_del(&desc->node);
+ pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+ kfree(desc);
+ }
+ list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) {
+ list_del(&desc->node);
+ pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+ kfree(desc);
+ }
+ spin_unlock_bh(&ioat_chan->desc_lock);
+
+ pci_pool_free(ioat_device->completion_pool,
+ ioat_chan->completion_virt,
+ ioat_chan->completion_addr);
+
+ /* one is ok since we left it on there on purpose */
+ if (in_use_descs > 1)
+ printk(KERN_ERR "IOAT: Freeing %d in use descriptors!\n",
+ in_use_descs - 1);
+
+ ioat_chan->last_completion = ioat_chan->completion_addr = 0;
+
+ /* Tell hw the chan is free */
+ chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+ chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE;
+ ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+}
+
+/**
+ * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction
+ * @chan: IOAT DMA channel handle
+ * @dest: DMA destination address
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
+ dma_addr_t dest,
+ dma_addr_t src,
+ size_t len)
+{
+ struct ioat_desc_sw *first;
+ struct ioat_desc_sw *prev;
+ struct ioat_desc_sw *new;
+ dma_cookie_t cookie;
+ LIST_HEAD(new_chain);
+ u32 copy;
+ size_t orig_len;
+ dma_addr_t orig_src, orig_dst;
+ unsigned int desc_count = 0;
+ unsigned int append = 0;
+
+ if (!ioat_chan || !dest || !src)
+ return -EFAULT;
+
+ if (!len)
+ return ioat_chan->common.cookie;
+
+ orig_len = len;
+ orig_src = src;
+ orig_dst = dest;
+
+ first = NULL;
+ prev = NULL;
+
+ spin_lock_bh(&ioat_chan->desc_lock);
+
+ while (len) {
+ if (!list_empty(&ioat_chan->free_desc)) {
+ new = to_ioat_desc(ioat_chan->free_desc.next);
+ list_del(&new->node);
+ } else {
+ /* try to get another desc */
+ new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
+ /* will this ever happen? */
+ /* TODO add upper limit on these */
+ BUG_ON(!new);
+ }
+
+ copy = min((u32) len, ioat_chan->xfercap);
+
+ new->hw->size = copy;
+ new->hw->ctl = 0;
+ new->hw->src_addr = src;
+ new->hw->dst_addr = dest;
+ new->cookie = 0;
+
+ /* chain together the physical address list for the HW */
+ if (!first)
+ first = new;
+ else
+ prev->hw->next = (u64) new->phys;
+
+ prev = new;
+
+ len -= copy;
+ dest += copy;
+ src += copy;
+
+ list_add_tail(&new->node, &new_chain);
+ desc_count++;
+ }
+ new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
+ new->hw->next = 0;
+
+ /* cookie incr and addition to used_list must be atomic */
+
+ cookie = ioat_chan->common.cookie;
+ cookie++;
+ if (cookie < 0)
+ cookie = 1;
+ ioat_chan->common.cookie = new->cookie = cookie;
+
+ pci_unmap_addr_set(new, src, orig_src);
+ pci_unmap_addr_set(new, dst, orig_dst);
+ pci_unmap_len_set(new, src_len, orig_len);
+ pci_unmap_len_set(new, dst_len, orig_len);
+
+ /* write address into NextDescriptor field of last desc in chain */
+ to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys;
+ list_splice_init(&new_chain, ioat_chan->used_desc.prev);
+
+ ioat_chan->pending += desc_count;
+ if (ioat_chan->pending >= 20) {
+ append = 1;
+ ioat_chan->pending = 0;
+ }
+
+ spin_unlock_bh(&ioat_chan->desc_lock);
+
+ if (append)
+ ioatdma_chan_write8(ioat_chan,
+ IOAT_CHANCMD_OFFSET,
+ IOAT_CHANCMD_APPEND);
+ return cookie;
+}
+
+/**
+ * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs
+ * @chan: IOAT DMA channel handle
+ * @dest: DMA destination address
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan,
+ void *dest,
+ void *src,
+ size_t len)
+{
+ dma_addr_t dest_addr;
+ dma_addr_t src_addr;
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+ dest_addr = pci_map_single(ioat_chan->device->pdev,
+ dest, len, PCI_DMA_FROMDEVICE);
+ src_addr = pci_map_single(ioat_chan->device->pdev,
+ src, len, PCI_DMA_TODEVICE);
+
+ return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page
+ * @chan: IOAT DMA channel handle
+ * @page: pointer to the page to copy to
+ * @offset: offset into that page
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan,
+ struct page *page,
+ unsigned int offset,
+ void *src,
+ size_t len)
+{
+ dma_addr_t dest_addr;
+ dma_addr_t src_addr;
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+ dest_addr = pci_map_page(ioat_chan->device->pdev,
+ page, offset, len, PCI_DMA_FROMDEVICE);
+ src_addr = pci_map_single(ioat_chan->device->pdev,
+ src, len, PCI_DMA_TODEVICE);
+
+ return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages
+ * @chan: IOAT DMA channel handle
+ * @dest_pg: pointer to the page to copy to
+ * @dest_off: offset into that page
+ * @src_pg: pointer to the page to copy from
+ * @src_off: offset into that page
+ * @len: transaction length in bytes. This is guaranteed to not make a copy
+ * across a page boundary.
+ */
+
+static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan,
+ struct page *dest_pg,
+ unsigned int dest_off,
+ struct page *src_pg,
+ unsigned int src_off,
+ size_t len)
+{
+ dma_addr_t dest_addr;
+ dma_addr_t src_addr;
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+ dest_addr = pci_map_page(ioat_chan->device->pdev,
+ dest_pg, dest_off, len, PCI_DMA_FROMDEVICE);
+ src_addr = pci_map_page(ioat_chan->device->pdev,
+ src_pg, src_off, len, PCI_DMA_TODEVICE);
+
+ return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_issue_pending - push potentially unrecognoized appended descriptors to hw
+ * @chan: DMA channel handle
+ */
+
+static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan)
+{
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+ if (ioat_chan->pending != 0) {
+ ioat_chan->pending = 0;
+ ioatdma_chan_write8(ioat_chan,
+ IOAT_CHANCMD_OFFSET,
+ IOAT_CHANCMD_APPEND);
+ }
+}
+
+static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
+{
+ unsigned long phys_complete;
+ struct ioat_desc_sw *desc, *_desc;
+ dma_cookie_t cookie = 0;
+
+ prefetch(chan->completion_virt);
+
+ if (!spin_trylock(&chan->cleanup_lock))
+ return;
+
+ /* The completion writeback can happen at any time,
+ so reads by the driver need to be atomic operations
+ The descriptor physical addresses are limited to 32-bits
+ when the CPU can only do a 32-bit mov */
+
+#if (BITS_PER_LONG == 64)
+ phys_complete =
+ chan->completion_virt->full & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
+#else
+ phys_complete = chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK;
+#endif
+
+ if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
+ IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
+ printk("IOAT: Channel halted, chanerr = %x\n",
+ ioatdma_chan_read32(chan, IOAT_CHANERR_OFFSET));
+
+ /* TODO do something to salvage the situation */
+ }
+
+ if (phys_complete == chan->last_completion) {
+ spin_unlock(&chan->cleanup_lock);
+ return;
+ }
+
+ spin_lock_bh(&chan->desc_lock);
+ list_for_each_entry_safe(desc, _desc, &chan->used_desc, node) {
+
+ /*
+ * Incoming DMA requests may use multiple descriptors, due to
+ * exceeding xfercap, perhaps. If so, only the last one will
+ * have a cookie, and require unmapping.
+ */
+ if (desc->cookie) {
+ cookie = desc->cookie;
+
+ /* yes we are unmapping both _page and _single alloc'd
+ regions with unmap_page. Is this *really* that bad?
+ */
+ pci_unmap_page(chan->device->pdev,
+ pci_unmap_addr(desc, dst),
+ pci_unmap_len(desc, dst_len),
+ PCI_DMA_FROMDEVICE);
+ pci_unmap_page(chan->device->pdev,
+ pci_unmap_addr(desc, src),
+ pci_unmap_len(desc, src_len),
+ PCI_DMA_TODEVICE);
+ }
+
+ if (desc->phys != phys_complete) {
+ /* a completed entry, but not the last, so cleanup */
+ list_del(&desc->node);
+ list_add_tail(&desc->node, &chan->free_desc);
+ } else {
+ /* last used desc. Do not remove, so we can append from
+ it, but don't look at it next time, either */
+ desc->cookie = 0;
+
+ /* TODO check status bits? */
+ break;
+ }
+ }
+
+ spin_unlock_bh(&chan->desc_lock);
+
+ chan->last_completion = phys_complete;
+ if (cookie != 0)
+ chan->completed_cookie = cookie;
+
+ spin_unlock(&chan->cleanup_lock);
+}
+
+/**
+ * ioat_dma_is_complete - poll the status of a IOAT DMA transaction
+ * @chan: IOAT DMA channel handle
+ * @cookie: DMA transaction identifier
+ */
+
+static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
+ dma_cookie_t cookie,
+ dma_cookie_t *done,
+ dma_cookie_t *used)
+{
+ struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+ dma_cookie_t last_used;
+ dma_cookie_t last_complete;
+ enum dma_status ret;
+
+ last_used = chan->cookie;
+ last_complete = ioat_chan->completed_cookie;
+
+ if (done)
+ *done= last_complete;
+ if (used)
+ *used = last_used;
+
+ ret = dma_async_is_complete(cookie, last_complete, last_used);
+ if (ret == DMA_SUCCESS)
+ return ret;
+
+ ioat_dma_memcpy_cleanup(ioat_chan);
+
+ last_used = chan->cookie;
+ last_complete = ioat_chan->completed_cookie;
+
+ if (done)
+ *done= last_complete;
+ if (used)
+ *used = last_used;
+
+ return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+/* PCI API */
+
+static struct pci_device_id ioat_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+ { 0, }
+};
+
+static struct pci_driver ioat_pci_drv = {
+ .name = "ioatdma",
+ .id_table = ioat_pci_tbl,
+ .probe = ioat_probe,
+ .remove = __devexit_p(ioat_remove),
+};
+
+static irqreturn_t ioat_do_interrupt(int irq, void *data, struct pt_regs *regs)
+{
+ struct ioat_device *instance = data;
+ unsigned long attnstatus;
+ u8 intrctrl;
+
+ intrctrl = ioatdma_read8(instance, IOAT_INTRCTRL_OFFSET);
+
+ if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
+ return IRQ_NONE;
+
+ if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
+ ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl);
+ return IRQ_NONE;
+ }
+
+ attnstatus = ioatdma_read32(instance, IOAT_ATTNSTATUS_OFFSET);
+
+ printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus);
+
+ ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl);
+ return IRQ_HANDLED;
+}
+
+static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan)
+{
+ struct ioat_desc_sw *desc;
+
+ spin_lock_bh(&ioat_chan->desc_lock);
+
+ if (!list_empty(&ioat_chan->free_desc)) {
+ desc = to_ioat_desc(ioat_chan->free_desc.next);
+ list_del(&desc->node);
+ } else {
+ /* try to get another desc */
+ spin_unlock_bh(&ioat_chan->desc_lock);
+ desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
+ spin_lock_bh(&ioat_chan->desc_lock);
+ /* will this ever happen? */
+ BUG_ON(!desc);
+ }
+
+ desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
+ desc->hw->next = 0;
+
+ list_add_tail(&desc->node, &ioat_chan->used_desc);
+ spin_unlock_bh(&ioat_chan->desc_lock);
+
+#if (BITS_PER_LONG == 64)
+ ioatdma_chan_write64(ioat_chan, IOAT_CHAINADDR_OFFSET, desc->phys);
+#else
+ ioatdma_chan_write32(ioat_chan,
+ IOAT_CHAINADDR_OFFSET_LOW,
+ (u32) desc->phys);
+ ioatdma_chan_write32(ioat_chan, IOAT_CHAINADDR_OFFSET_HIGH, 0);
+#endif
+ ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_START);
+}
+
+/*
+ * Perform a IOAT transaction to verify the HW works.
+ */
+#define IOAT_TEST_SIZE 2000
+
+static int ioat_self_test(struct ioat_device *device)
+{
+ int i;
+ u8 *src;
+ u8 *dest;
+ struct dma_chan *dma_chan;
+ dma_cookie_t cookie;
+ int err = 0;
+
+ src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+ if (!src)
+ return -ENOMEM;
+ dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+ if (!dest) {
+ kfree(src);
+ return -ENOMEM;
+ }
+
+ /* Fill in src buffer */
+ for (i = 0; i < IOAT_TEST_SIZE; i++)
+ src[i] = (u8)i;
+
+ /* Start copy, using first DMA channel */
+ dma_chan = container_of(device->common.channels.next,
+ struct dma_chan,
+ device_node);
+ if (ioat_dma_alloc_chan_resources(dma_chan) < 1) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE);
+ ioat_dma_memcpy_issue_pending(dma_chan);
+ msleep(1);
+
+ if (ioat_dma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+ printk(KERN_ERR "ioatdma: Self-test copy timed out, disabling\n");
+ err = -ENODEV;
+ goto free_resources;
+ }
+ if (memcmp(src, dest, IOAT_TEST_SIZE)) {
+ printk(KERN_ERR "ioatdma: Self-test copy failed compare, disabling\n");
+ err = -ENODEV;
+ goto free_resources;
+ }
+
+free_resources:
+ ioat_dma_free_chan_resources(dma_chan);
+out:
+ kfree(src);
+ kfree(dest);
+ return err;
+}
+
+static int __devinit ioat_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int err;
+ unsigned long mmio_start, mmio_len;
+ void *reg_base;
+ struct ioat_device *device;
+
+ err = pci_enable_device(pdev);
+ if (err)
+ goto err_enable_device;
+
+ err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+ if (err)
+ err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+ if (err)
+ goto err_set_dma_mask;
+
+ err = pci_request_regions(pdev, ioat_pci_drv.name);
+ if (err)
+ goto err_request_regions;
+
+ mmio_start = pci_resource_start(pdev, 0);
+ mmio_len = pci_resource_len(pdev, 0);
+
+ reg_base = ioremap(mmio_start, mmio_len);
+ if (!reg_base) {
+ err = -ENOMEM;
+ goto err_ioremap;
+ }
+
+ device = kzalloc(sizeof(*device), GFP_KERNEL);
+ if (!device) {
+ err = -ENOMEM;
+ goto err_kzalloc;
+ }
+
+ /* DMA coherent memory pool for DMA descriptor allocations */
+ device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
+ sizeof(struct ioat_dma_descriptor), 64, 0);
+ if (!device->dma_pool) {
+ err = -ENOMEM;
+ goto err_dma_pool;
+ }
+
+ device->completion_pool = pci_pool_create("completion_pool", pdev, sizeof(u64), SMP_CACHE_BYTES, SMP_CACHE_BYTES);
+ if (!device->completion_pool) {
+ err = -ENOMEM;
+ goto err_completion_pool;
+ }
+
+ device->pdev = pdev;
+ pci_set_drvdata(pdev, device);
+#ifdef CONFIG_PCI_MSI
+ if (pci_enable_msi(pdev) == 0) {
+ device->msi = 1;
+ } else {
+ device->msi = 0;
+ }
+#endif
+ err = request_irq(pdev->irq, &ioat_do_interrupt, SA_SHIRQ, "ioat",
+ device);
+ if (err)
+ goto err_irq;
+
+ device->reg_base = reg_base;
+
+ ioatdma_write8(device, IOAT_INTRCTRL_OFFSET, IOAT_INTRCTRL_MASTER_INT_EN);
+ pci_set_master(pdev);
+
+ INIT_LIST_HEAD(&device->common.channels);
+ enumerate_dma_channels(device);
+
+ device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources;
+ device->common.device_free_chan_resources = ioat_dma_free_chan_resources;
+ device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf;
+ device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg;
+ device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg;
+ device->common.device_memcpy_complete = ioat_dma_is_complete;
+ device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending;
+ printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
+ device->common.chancnt);
+
+ err = ioat_self_test(device);
+ if (err)
+ goto err_self_test;
+
+ dma_async_device_register(&device->common);
+
+ return 0;
+
+err_self_test:
+err_irq:
+ pci_pool_destroy(device->completion_pool);
+err_completion_pool:
+ pci_pool_destroy(device->dma_pool);
+err_dma_pool:
+ kfree(device);
+err_kzalloc:
+ iounmap(reg_base);
+err_ioremap:
+ pci_release_regions(pdev);
+err_request_regions:
+err_set_dma_mask:
+ pci_disable_device(pdev);
+err_enable_device:
+ return err;
+}
+
+static void __devexit ioat_remove(struct pci_dev *pdev)
+{
+ struct ioat_device *device;
+ struct dma_chan *chan, *_chan;
+ struct ioat_dma_chan *ioat_chan;
+
+ device = pci_get_drvdata(pdev);
+ dma_async_device_unregister(&device->common);
+
+ free_irq(device->pdev->irq, device);
+#ifdef CONFIG_PCI_MSI
+ if (device->msi)
+ pci_disable_msi(device->pdev);
+#endif
+ pci_pool_destroy(device->dma_pool);
+ pci_pool_destroy(device->completion_pool);
+ iounmap(device->reg_base);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+ list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) {
+ ioat_chan = to_ioat_chan(chan);
+ list_del(&chan->device_node);
+ kfree(ioat_chan);
+ }
+ kfree(device);
+}
+
+/* MODULE API */
+MODULE_VERSION("1.7");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static int __init ioat_init_module(void)
+{
+ /* it's currently unsafe to unload this module */
+ /* if forced, worst case is that rmmod hangs */
+ if (THIS_MODULE != NULL)
+ THIS_MODULE->unsafe = 1;
+
+ return pci_module_init(&ioat_pci_drv);
+}
+
+module_init(ioat_init_module);
+
+static void __exit ioat_exit_module(void)
+{
+ pci_unregister_driver(&ioat_pci_drv);
+}
+
+module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
new file mode 100644
index 0000000..312353d
--- /dev/null
+++ b/drivers/dma/ioatdma.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_H
+#define IOATDMA_H
+
+#include <linux/dmaengine.h>
+#include "ioatdma_hw.h"
+#include <linux/init.h>
+#include <linux/dmapool.h>
+#include <linux/cache.h>
+
+#define PCI_DEVICE_ID_INTEL_IOAT 0x1a38
+
+#define IOAT_LOW_COMPLETION_MASK 0xffffffc0
+
+extern struct list_head dma_device_list;
+extern struct list_head dma_client_list;
+
+/**
+ * struct ioat_device - internal representation of a IOAT device
+ * @pdev: PCI-Express device
+ * @reg_base: MMIO register space base address
+ * @dma_pool: for allocating DMA descriptors
+ * @common: embedded struct dma_device
+ * @msi: Message Signaled Interrupt number
+ */
+
+struct ioat_device {
+ struct pci_dev *pdev;
+ void *reg_base;
+ struct pci_pool *dma_pool;
+ struct pci_pool *completion_pool;
+
+ struct dma_device common;
+ u8 msi;
+};
+
+/**
+ * struct ioat_dma_chan - internal representation of a DMA channel
+ * @device:
+ * @reg_base:
+ * @sw_in_use:
+ * @completion:
+ * @completion_low:
+ * @completion_high:
+ * @completed_cookie: last cookie seen completed on cleanup
+ * @cookie: value of last cookie given to client
+ * @last_completion:
+ * @xfercap:
+ * @desc_lock:
+ * @free_desc:
+ * @used_desc:
+ * @resource:
+ * @device_node:
+ */
+
+struct ioat_dma_chan {
+
+ void *reg_base;
+
+ dma_cookie_t completed_cookie;
+ unsigned long last_completion;
+
+ u32 xfercap; /* XFERCAP register value expanded out */
+
+ spinlock_t cleanup_lock;
+ spinlock_t desc_lock;
+ struct list_head free_desc;
+ struct list_head used_desc;
+
+ int pending;
+
+ struct ioat_device *device;
+ struct dma_chan common;
+
+ dma_addr_t completion_addr;
+ union {
+ u64 full; /* HW completion writeback */
+ struct {
+ u32 low;
+ u32 high;
+ };
+ } *completion_virt;
+};
+
+/* wrapper around hardware descriptor format + additional software fields */
+
+/**
+ * struct ioat_desc_sw - wrapper around hardware descriptor
+ * @hw: hardware DMA descriptor
+ * @node:
+ * @cookie:
+ * @phys:
+ */
+
+struct ioat_desc_sw {
+ struct ioat_dma_descriptor *hw;
+ struct list_head node;
+ dma_cookie_t cookie;
+ dma_addr_t phys;
+ DECLARE_PCI_UNMAP_ADDR(src)
+ DECLARE_PCI_UNMAP_LEN(src_len)
+ DECLARE_PCI_UNMAP_ADDR(dst)
+ DECLARE_PCI_UNMAP_LEN(dst_len)
+};
+
+#endif /* IOATDMA_H */
+
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
new file mode 100644
index 0000000..4d7a128
--- /dev/null
+++ b/drivers/dma/ioatdma_hw.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_HW_H_
+#define _IOAT_HW_H_
+
+/* PCI Configuration Space Values */
+#define IOAT_PCI_VID 0x8086
+#define IOAT_PCI_DID 0x1A38
+#define IOAT_PCI_RID 0x00
+#define IOAT_PCI_SVID 0x8086
+#define IOAT_PCI_SID 0x8086
+#define IOAT_VER 0x12 /* Version 1.2 */
+
+struct ioat_dma_descriptor {
+ uint32_t size;
+ uint32_t ctl;
+ uint64_t src_addr;
+ uint64_t dst_addr;
+ uint64_t next;
+ uint64_t rsv1;
+ uint64_t rsv2;
+ uint64_t user1;
+ uint64_t user2;
+};
+
+#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001
+#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002
+#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004
+#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008
+#define IOAT_DMA_DESCRIPTOR_CTL_FRAME 0x00000010
+#define IOAT_DMA_DESCRIPTOR_NUL 0x00000020
+#define IOAT_DMA_DESCRIPTOR_OPCODE 0xFF000000
+
+#endif
diff --git a/drivers/dma/ioatdma_io.h b/drivers/dma/ioatdma_io.h
new file mode 100644
index 0000000..c0b4bf6
--- /dev/null
+++ b/drivers/dma/ioatdma_io.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_IO_H
+#define IOATDMA_IO_H
+
+#include <asm/io.h>
+
+/*
+ * device and per-channel MMIO register read and write functions
+ * this is a lot of anoying inline functions, but it's typesafe
+ */
+
+static inline u8 ioatdma_read8(struct ioat_device *device,
+ unsigned int offset)
+{
+ return readb(device->reg_base + offset);
+}
+
+static inline u16 ioatdma_read16(struct ioat_device *device,
+ unsigned int offset)
+{
+ return readw(device->reg_base + offset);
+}
+
+static inline u32 ioatdma_read32(struct ioat_device *device,
+ unsigned int offset)
+{
+ return readl(device->reg_base + offset);
+}
+
+static inline void ioatdma_write8(struct ioat_device *device,
+ unsigned int offset, u8 value)
+{
+ writeb(value, device->reg_base + offset);
+}
+
+static inline void ioatdma_write16(struct ioat_device *device,
+ unsigned int offset, u16 value)
+{
+ writew(value, device->reg_base + offset);
+}
+
+static inline void ioatdma_write32(struct ioat_device *device,
+ unsigned int offset, u32 value)
+{
+ writel(value, device->reg_base + offset);
+}
+
+static inline u8 ioatdma_chan_read8(struct ioat_dma_chan *chan,
+ unsigned int offset)
+{
+ return readb(chan->reg_base + offset);
+}
+
+static inline u16 ioatdma_chan_read16(struct ioat_dma_chan *chan,
+ unsigned int offset)
+{
+ return readw(chan->reg_base + offset);
+}
+
+static inline u32 ioatdma_chan_read32(struct ioat_dma_chan *chan,
+ unsigned int offset)
+{
+ return readl(chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write8(struct ioat_dma_chan *chan,
+ unsigned int offset, u8 value)
+{
+ writeb(value, chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write16(struct ioat_dma_chan *chan,
+ unsigned int offset, u16 value)
+{
+ writew(value, chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write32(struct ioat_dma_chan *chan,
+ unsigned int offset, u32 value)
+{
+ writel(value, chan->reg_base + offset);
+}
+
+#if (BITS_PER_LONG == 64)
+static inline u64 ioatdma_chan_read64(struct ioat_dma_chan *chan,
+ unsigned int offset)
+{
+ return readq(chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write64(struct ioat_dma_chan *chan,
+ unsigned int offset, u64 value)
+{
+ writeq(value, chan->reg_base + offset);
+}
+#endif
+
+#endif /* IOATDMA_IO_H */
+
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioatdma_registers.h
new file mode 100644
index 0000000..41a21ab
--- /dev/null
+++ b/drivers/dma/ioatdma_registers.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_REGISTERS_H_
+#define _IOAT_REGISTERS_H_
+
+
+/* MMIO Device Registers */
+#define IOAT_CHANCNT_OFFSET 0x00 /* 8-bit */
+
+#define IOAT_XFERCAP_OFFSET 0x01 /* 8-bit */
+#define IOAT_XFERCAP_4KB 12
+#define IOAT_XFERCAP_8KB 13
+#define IOAT_XFERCAP_16KB 14
+#define IOAT_XFERCAP_32KB 15
+#define IOAT_XFERCAP_32GB 0
+
+#define IOAT_GENCTRL_OFFSET 0x02 /* 8-bit */
+#define IOAT_GENCTRL_DEBUG_EN 0x01
+
+#define IOAT_INTRCTRL_OFFSET 0x03 /* 8-bit */
+#define IOAT_INTRCTRL_MASTER_INT_EN 0x01 /* Master Interrupt Enable */
+#define IOAT_INTRCTRL_INT_STATUS 0x02 /* ATTNSTATUS -or- Channel Int */
+#define IOAT_INTRCTRL_INT 0x04 /* INT_STATUS -and- MASTER_INT_EN */
+
+#define IOAT_ATTNSTATUS_OFFSET 0x04 /* Each bit is a channel */
+
+#define IOAT_VER_OFFSET 0x08 /* 8-bit */
+#define IOAT_VER_MAJOR_MASK 0xF0
+#define IOAT_VER_MINOR_MASK 0x0F
+#define GET_IOAT_VER_MAJOR(x) ((x) & IOAT_VER_MAJOR_MASK)
+#define GET_IOAT_VER_MINOR(x) ((x) & IOAT_VER_MINOR_MASK)
+
+#define IOAT_PERPORTOFFSET_OFFSET 0x0A /* 16-bit */
+
+#define IOAT_INTRDELAY_OFFSET 0x0C /* 16-bit */
+#define IOAT_INTRDELAY_INT_DELAY_MASK 0x3FFF /* Interrupt Delay Time */
+#define IOAT_INTRDELAY_COALESE_SUPPORT 0x8000 /* Interrupt Coalesing Supported */
+
+#define IOAT_DEVICE_STATUS_OFFSET 0x0E /* 16-bit */
+#define IOAT_DEVICE_STATUS_DEGRADED_MODE 0x0001
+
+
+#define IOAT_CHANNEL_MMIO_SIZE 0x80 /* Each Channel MMIO space is this size */
+
+/* DMA Channel Registers */
+#define IOAT_CHANCTRL_OFFSET 0x00 /* 16-bit Channel Control Register */
+#define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK 0xF000
+#define IOAT_CHANCTRL_CHANNEL_IN_USE 0x0100
+#define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL 0x0020
+#define IOAT_CHANCTRL_ERR_INT_EN 0x0010
+#define IOAT_CHANCTRL_ANY_ERR_ABORT_EN 0x0008
+#define IOAT_CHANCTRL_ERR_COMPLETION_EN 0x0004
+#define IOAT_CHANCTRL_INT_DISABLE 0x0001
+
+#define IOAT_DMA_COMP_OFFSET 0x02 /* 16-bit DMA channel compatability */
+#define IOAT_DMA_COMP_V1 0x0001 /* Compatability with DMA version 1 */
+
+#define IOAT_CHANSTS_OFFSET 0x04 /* 64-bit Channel Status Register */
+#define IOAT_CHANSTS_OFFSET_LOW 0x04
+#define IOAT_CHANSTS_OFFSET_HIGH 0x08
+#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR 0xFFFFFFFFFFFFFFC0
+#define IOAT_CHANSTS_SOFT_ERR 0x0000000000000010
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS 0x0000000000000007
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE 0x0
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE 0x1
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED 0x2
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED 0x3
+
+#define IOAT_CHAINADDR_OFFSET 0x0C /* 64-bit Descriptor Chain Address Register */
+#define IOAT_CHAINADDR_OFFSET_LOW 0x0C
+#define IOAT_CHAINADDR_OFFSET_HIGH 0x10
+
+#define IOAT_CHANCMD_OFFSET 0x14 /* 8-bit DMA Channel Command Register */
+#define IOAT_CHANCMD_RESET 0x20
+#define IOAT_CHANCMD_RESUME 0x10
+#define IOAT_CHANCMD_ABORT 0x08
+#define IOAT_CHANCMD_SUSPEND 0x04
+#define IOAT_CHANCMD_APPEND 0x02
+#define IOAT_CHANCMD_START 0x01
+
+#define IOAT_CHANCMP_OFFSET 0x18 /* 64-bit Channel Completion Address Register */
+#define IOAT_CHANCMP_OFFSET_LOW 0x18
+#define IOAT_CHANCMP_OFFSET_HIGH 0x1C
+
+#define IOAT_CDAR_OFFSET 0x20 /* 64-bit Current Descriptor Address Register */
+#define IOAT_CDAR_OFFSET_LOW 0x20
+#define IOAT_CDAR_OFFSET_HIGH 0x24
+
+#define IOAT_CHANERR_OFFSET 0x28 /* 32-bit Channel Error Register */
+#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR 0x0001
+#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR 0x0002
+#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR 0x0004
+#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR 0x0008
+#define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR 0x0010
+#define IOAT_CHANERR_CHANCMD_ERR 0x0020
+#define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0040
+#define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0080
+#define IOAT_CHANERR_READ_DATA_ERR 0x0100
+#define IOAT_CHANERR_WRITE_DATA_ERR 0x0200
+#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR 0x0400
+#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR 0x0800
+#define IOAT_CHANERR_COMPLETION_ADDR_ERR 0x1000
+#define IOAT_CHANERR_INT_CONFIGURATION_ERR 0x2000
+#define IOAT_CHANERR_SOFT_ERR 0x4000
+
+#define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */
+
+#endif /* _IOAT_REGISTERS_H_ */
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 1:13 ` David Miller
2006-05-24 0:20 ` [PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
` (6 subsequent siblings)
9 siblings, 1 reply; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Attempts to allocate per-CPU DMA channels
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/dma/Kconfig | 12 +++++
include/linux/netdevice.h | 4 ++
include/net/netdma.h | 38 ++++++++++++++++
net/core/dev.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 158 insertions(+), 0 deletions(-)
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
DMA engines offload copy operations from the CPU to dedicated
hardware, allowing the copies to happen asynchronously.
+comment "DMA Clients"
+
+config NET_DMA
+ bool "Network: TCP receive copy offload"
+ depends on DMA_ENGINE && NET
+ default y
+ ---help---
+ This enables the use of DMA engines in the network stack to
+ offload receive copy-to-user operations, freeing CPU cycles.
+ Since this is the main user of the DMA engine, it should be enabled;
+ say Y here.
+
comment "DMA Devices"
config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f4169bb..b5760c6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
#include <linux/config.h>
#include <linux/device.h>
#include <linux/percpu.h>
+#include <linux/dmaengine.h>
struct divert_blk;
struct vlan_group;
@@ -593,6 +594,9 @@ struct softnet_data
struct sk_buff *completion_queue;
struct net_device backlog_dev; /* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+ struct dma_chan *net_dma;
+#endif
};
DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 0000000..cbfe89d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/config.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+ struct dma_chan *chan;
+ rcu_read_lock();
+ chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+ if (chan)
+ dma_chan_get(chan);
+ rcu_read_unlock();
+ return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 2dce673..6e78798 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
+#include <linux/dmaengine.h>
/*
* The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[16]; /* 16 way hashed list */
static struct list_head ptype_all; /* Taps */
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
/*
* The @dev_base list is protected by @dev_base_lock and the rtln
* semaphore.
@@ -1844,6 +1851,19 @@ static void net_rx_action(struct softirq
}
}
out:
+#ifdef CONFIG_NET_DMA
+ /*
+ * There may not be any more sk_buffs coming right now, so push
+ * any pending DMA copies to hardware
+ */
+ if (net_dma_client) {
+ struct dma_chan *chan;
+ rcu_read_lock();
+ list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+ dma_async_memcpy_issue_pending(chan);
+ rcu_read_unlock();
+ }
+#endif
local_irq_enable();
return;
@@ -3298,6 +3318,88 @@ static int dev_cpu_callback(struct notif
}
#endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes. The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+ unsigned int cpu, i, n;
+ struct dma_chan *chan;
+
+ lock_cpu_hotplug();
+
+ if (net_dma_count == 0) {
+ for_each_online_cpu(cpu)
+ rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
+ unlock_cpu_hotplug();
+ return;
+ }
+
+ i = 0;
+ cpu = first_cpu(cpu_online_map);
+
+ rcu_read_lock();
+ list_for_each_entry(chan, &net_dma_client->channels, client_node) {
+ n = ((num_online_cpus() / net_dma_count)
+ + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+
+ while(n) {
+ per_cpu(softnet_data.net_dma, cpu) = chan;
+ cpu = next_cpu(cpu, cpu_online_map);
+ n--;
+ }
+ i++;
+ }
+ rcu_read_unlock();
+
+ unlock_cpu_hotplug();
+}
+
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan:
+ * @event:
+ */
+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+ enum dma_event event)
+{
+ spin_lock(&net_dma_event_lock);
+ switch (event) {
+ case DMA_RESOURCE_ADDED:
+ net_dma_count++;
+ net_dma_rebalance();
+ break;
+ case DMA_RESOURCE_REMOVED:
+ net_dma_count--;
+ net_dma_rebalance();
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&net_dma_event_lock);
+}
+
+/**
+ * netdev_dma_regiser - register the networking subsystem as a DMA client
+ */
+static int __init netdev_dma_register(void)
+{
+ spin_lock_init(&net_dma_event_lock);
+ net_dma_client = dma_async_client_register(netdev_dma_event);
+ if (net_dma_client == NULL)
+ return -ENOMEM;
+
+ dma_async_client_chan_request(net_dma_client, num_online_cpus());
+ return 0;
+}
+
+#else
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
/*
* Initialize the DEV module. At boot time this walks the device list and
@@ -3351,6 +3453,8 @@ static int __init net_dev_init(void)
atomic_set(&queue->backlog_dev.refcnt, 1);
}
+ netdev_dma_register();
+
dev_boot_phase = 0;
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (2 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:20 ` [PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
` (5 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
drivers/dma/Makefile | 3
drivers/dma/iovlock.c | 301 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/dmaengine.h | 22 +++
include/net/netdma.h | 6 +
net/core/Makefile | 1
net/core/user_dma.c | 127 +++++++++++++++++++
6 files changed, 459 insertions(+), 1 deletions(-)
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f56..bdcfdbd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 0000000..5ed327e
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+int num_pages_spanned(struct iovec *iov)
+{
+ return
+ ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+ ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+ struct dma_pinned_list *local_list;
+ struct page **pages;
+ int i;
+ int ret;
+ int nr_iovecs = 0;
+ int iovec_len_used = 0;
+ int iovec_pages_used = 0;
+ long err;
+
+ /* don't pin down non-user-based iovecs */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ return NULL;
+
+ /* determine how many iovecs/pages there are, up front */
+ do {
+ iovec_len_used += iov[nr_iovecs].iov_len;
+ iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]);
+ nr_iovecs++;
+ } while (iovec_len_used < len);
+
+ /* single kmalloc for pinned list, page_list[], and the page arrays */
+ local_list = kmalloc(sizeof(*local_list)
+ + (nr_iovecs * sizeof (struct dma_page_list))
+ + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+ if (!local_list) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* list of pages starts right after the page list array */
+ pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+ for (i = 0; i < nr_iovecs; i++) {
+ struct dma_page_list *page_list = &local_list->page_list[i];
+
+ len -= iov[i].iov_len;
+
+ if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+ err = -EFAULT;
+ goto unpin;
+ }
+
+ page_list->nr_pages = num_pages_spanned(&iov[i]);
+ page_list->base_address = iov[i].iov_base;
+
+ page_list->pages = pages;
+ pages += page_list->nr_pages;
+
+ /* pin pages down */
+ down_read(¤t->mm->mmap_sem);
+ ret = get_user_pages(
+ current,
+ current->mm,
+ (unsigned long) iov[i].iov_base,
+ page_list->nr_pages,
+ 1, /* write */
+ 0, /* force */
+ page_list->pages,
+ NULL);
+ up_read(¤t->mm->mmap_sem);
+
+ if (ret != page_list->nr_pages) {
+ err = -ENOMEM;
+ goto unpin;
+ }
+
+ local_list->nr_iovecs = i + 1;
+ }
+
+ return local_list;
+
+unpin:
+ dma_unpin_iovec_pages(local_list);
+out:
+ return ERR_PTR(err);
+}
+
+void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
+{
+ int i, j;
+
+ if (!pinned_list)
+ return;
+
+ for (i = 0; i < pinned_list->nr_iovecs; i++) {
+ struct dma_page_list *page_list = &pinned_list->page_list[i];
+ for (j = 0; j < page_list->nr_pages; j++) {
+ set_page_dirty_lock(page_list->pages[j]);
+ page_cache_release(page_list->pages[j]);
+ }
+ }
+
+ kfree(pinned_list);
+}
+
+static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct
+ iovec *iov, unsigned char *kdata, size_t len)
+{
+ dma_cookie_t dma_cookie = 0;
+
+ while (len > 0) {
+ if (iov->iov_len) {
+ int copy = min_t(unsigned int, iov->iov_len, len);
+ dma_cookie = dma_async_memcpy_buf_to_buf(
+ chan,
+ iov->iov_base,
+ kdata,
+ copy);
+ kdata += copy;
+ len -= copy;
+ iov->iov_len -= copy;
+ iov->iov_base += copy;
+ }
+ iov++;
+ }
+
+ return dma_cookie;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ * by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+ struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len)
+{
+ int iov_byte_offset;
+ int copy;
+ dma_cookie_t dma_cookie = 0;
+ int iovec_idx;
+ int page_idx;
+
+ if (!chan)
+ return memcpy_toiovec(iov, kdata, len);
+
+ /* -> kernel copies (e.g. smbfs) */
+ if (!pinned_list)
+ return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len);
+
+ iovec_idx = 0;
+ while (iovec_idx < pinned_list->nr_iovecs) {
+ struct dma_page_list *page_list;
+
+ /* skip already used-up iovecs */
+ while (!iov[iovec_idx].iov_len)
+ iovec_idx++;
+
+ page_list = &pinned_list->page_list[iovec_idx];
+
+ iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+ page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+ - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+ /* break up copies to not cross page boundary */
+ while (iov[iovec_idx].iov_len) {
+ copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+ copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+ dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+ page_list->pages[page_idx],
+ iov_byte_offset,
+ kdata,
+ copy);
+
+ len -= copy;
+ iov[iovec_idx].iov_len -= copy;
+ iov[iovec_idx].iov_base += copy;
+
+ if (!len)
+ return dma_cookie;
+
+ kdata += copy;
+ iov_byte_offset = 0;
+ page_idx++;
+ }
+ iovec_idx++;
+ }
+
+ /* really bad if we ever run out of iovecs */
+ BUG();
+ return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+ struct dma_pinned_list *pinned_list, struct page *page,
+ unsigned int offset, size_t len)
+{
+ int iov_byte_offset;
+ int copy;
+ dma_cookie_t dma_cookie = 0;
+ int iovec_idx;
+ int page_idx;
+ int err;
+
+ /* this needs as-yet-unimplemented buf-to-buff, so punt. */
+ /* TODO: use dma for this */
+ if (!chan || !pinned_list) {
+ u8 *vaddr = kmap(page);
+ err = memcpy_toiovec(iov, vaddr + offset, len);
+ kunmap(page);
+ return err;
+ }
+
+ iovec_idx = 0;
+ while (iovec_idx < pinned_list->nr_iovecs) {
+ struct dma_page_list *page_list;
+
+ /* skip already used-up iovecs */
+ while (!iov[iovec_idx].iov_len)
+ iovec_idx++;
+
+ page_list = &pinned_list->page_list[iovec_idx];
+
+ iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+ page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+ - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+ /* break up copies to not cross page boundary */
+ while (iov[iovec_idx].iov_len) {
+ copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+ copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+ dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+ page_list->pages[page_idx],
+ iov_byte_offset,
+ page,
+ offset,
+ copy);
+
+ len -= copy;
+ iov[iovec_idx].iov_len -= copy;
+ iov[iovec_idx].iov_base += copy;
+
+ if (!len)
+ return dma_cookie;
+
+ offset += copy;
+ iov_byte_offset = 0;
+ page_idx++;
+ }
+ iovec_idx++;
+ }
+
+ /* really bad if we ever run out of iovecs */
+ BUG();
+ return -EFAULT;
+}
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3078154..78b236c 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -333,5 +333,27 @@ static inline enum dma_status dma_async_
int dma_async_device_register(struct dma_device *device);
void dma_async_device_unregister(struct dma_device *device);
+/* --- Helper iov-locking functions --- */
+
+struct dma_page_list {
+ char *base_address;
+ int nr_pages;
+ struct page **pages;
+};
+
+struct dma_pinned_list {
+ int nr_iovecs;
+ struct dma_page_list page_list[0];
+};
+
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len);
+void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list);
+
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+ struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len);
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+ struct dma_pinned_list *pinned_list, struct page *page,
+ unsigned int offset, size_t len);
+
#endif /* CONFIG_DMA_ENGINE */
#endif /* DMAENGINE_H */
diff --git a/include/net/netdma.h b/include/net/netdma.h
index cbfe89d..19760eb 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -23,6 +23,7 @@
#include <linux/config.h>
#ifdef CONFIG_NET_DMA
#include <linux/dmaengine.h>
+#include <linux/skbuff.h>
static inline struct dma_chan *get_softnet_dma(void)
{
@@ -34,5 +35,10 @@ static inline struct dma_chan *get_softn
rcu_read_unlock();
return chan;
}
+
+int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
+ const struct sk_buff *skb, int offset, struct iovec *to,
+ size_t len, struct dma_pinned_list *pinned_list);
+
#endif /* CONFIG_NET_DMA */
#endif /* NETDMA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12c..e9bd246 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_WIRELESS_EXT) += wireless.o
obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 0000000..9eee91b
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+
+/**
+ * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ * @skb - buffer to copy
+ * @offset - offset in the buffer to start copying from
+ * @iovec - io vector to copy to
+ * @len - amount of data to copy from buffer to iovec
+ * @pinned_list - locked iovec buffer data
+ *
+ * Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+ struct sk_buff *skb, int offset, struct iovec *to,
+ size_t len, struct dma_pinned_list *pinned_list)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ dma_cookie_t cookie = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+ skb->data + offset, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ BUG_TRAP(start <= offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ copy = end - offset;
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+
+ cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+ frag->page_offset + offset - start, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ BUG_TRAP(start <= offset + len);
+
+ end = start + list->len;
+ copy = end - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_skb_copy_datagram_iovec(chan, list,
+ offset - start, to, copy,
+ pinned_list);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ }
+ start = end;
+ }
+ }
+
+end:
+ if (!len) {
+ skb->dma_cookie = cookie;
+ return cookie;
+ }
+
+fault:
+ return -EFAULT;
+}
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (3 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:20 ` [PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
` (4 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
include/linux/skbuff.h | 4 ++++
include/linux/tcp.h | 8 ++++++++
include/net/sock.h | 2 ++
include/net/tcp.h | 7 +++++++
net/core/sock.c | 6 ++++++
5 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f2347..23bad3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
#include <linux/net.h>
#include <linux/textsearch.h>
#include <net/checksum.h>
+#include <linux/dmaengine.h>
#define HAVE_ALLOC_SKB /* For the drivers to know */
#define HAVE_ALIGNABLE_SKB /* Ditto 8) */
@@ -285,6 +286,9 @@ struct sk_buff {
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
+#ifdef CONFIG_NET_DMA
+ dma_cookie_t dma_cookie;
+#endif
/* These elements must be at the end, see alloc_skb() for details. */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d395..c90daa5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
#define _LINUX_TCP_H
#include <linux/types.h>
+#include <linux/dmaengine.h>
#include <asm/byteorder.h>
struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
struct iovec *iov;
int memory;
int len;
+#ifdef CONFIG_NET_DMA
+ /* members for async copy */
+ struct dma_chan *dma_chan;
+ int wakeup;
+ struct dma_pinned_list *pinned_list;
+ dma_cookie_t dma_cookie;
+#endif
} ucopy;
__u32 snd_wl1; /* Sequence for window update */
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6f..90c65cb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_write_queue: Packet sending queue
+ * @sk_async_wait_queue: DMA copied packets
* @sk_omem_alloc: "o" is "option" or "other"
* @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
atomic_t sk_omem_alloc;
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
+ struct sk_buff_head sk_async_wait_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db..d0c2c2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
+#include <linux/dmaengine.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(str
tp->ucopy.len = 0;
tp->ucopy.memory = 0;
skb_queue_head_init(&tp->ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+ tp->ucopy.dma_chan = NULL;
+ tp->ucopy.wakeup = 0;
+ tp->ucopy.pinned_list = NULL;
+ tp->ucopy.dma_cookie = 0;
+#endif
}
/* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb..5d820c3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock
atomic_set(&newsk->sk_omem_alloc, 0);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
rwlock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock,
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+ skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
sk->sk_send_head = NULL;
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (4 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:20 ` [PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware Chris Leech
` (3 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
include/net/tcp.h | 2 ++
net/ipv4/tcp.c | 10 +++++-----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0c2c2f..578cccf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct
extern void tcp_rcv_space_adjust(struct sock *sk);
+extern void tcp_cleanup_rbuf(struct sock *sk, int copied);
+
extern int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b80..1c0cfd7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk,
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_
/* Clean up data we have read: This will do ACK frames. */
if (copied)
- cleanup_rbuf(sk, copied);
+ tcp_cleanup_rbuf(sk, copied);
return copied;
}
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
}
- cleanup_rbuf(sk, copied);
+ tcp_cleanup_rbuf(sk, copied);
if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
*/
/* Clean up data we have read: This will do ACK frames. */
- cleanup_rbuf(sk, copied);
+ tcp_cleanup_rbuf(sk, copied);
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
inet_csk_ack_scheduled(sk)) {
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
- cleanup_rbuf(sk, 1);
+ tcp_cleanup_rbuf(sk, 1);
if (!(val & 1))
icsk->icsk_ack.pingpong = 1;
}
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (5 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:20 ` [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
` (2 subsequent siblings)
9 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Add an extra argument to sk_eat_skb, and make it move early copied packets
to the async_wait_queue instead of freeing them.
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
include/net/sock.h | 13 ++++++++++++-
net/dccp/proto.c | 4 ++--
net/ipv4/tcp.c | 8 ++++----
net/llc/af_llc.c | 2 +-
4 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 90c65cb..75b0e97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg,
* This routine must be called with interrupts disabled or with the socket
* locked so that the sk_buff queue operation is ok.
*/
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (!copied_early)
+ __kfree_skb(skb);
+ else
+ __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
{
__skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
+#endif
extern void sock_enable_timestamp(struct sock *sk);
extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee83..5317fd3 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str
}
dccp_pr_debug("packet_type=%s\n",
dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
}
found_fin_ok:
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
break;
} while (1);
out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c0cfd7..4e067d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
break;
}
if (skb->h.th->fin) {
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
if (!desc->count)
break;
}
@@ -1356,14 +1356,14 @@ skip_copy:
if (skb->h.th->fin)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
continue;
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
break;
} while (len > 0);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db7..7465170 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *
continue;
if (!(flags & MSG_PEEK)) {
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, 0);
*seq = 0;
}
} while (len > 0);
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (6 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 1:02 ` Andrew Morton
2006-05-24 0:20 ` [PATCH 9/9] [I/OAT] TCP recv offload to I/OAT Chris Leech
2006-05-24 0:22 ` [PATCH 0/9] I/OAT repost David Miller
9 siblings, 1 reply; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Any socket recv of less than this ammount will not be offloaded
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
include/linux/sysctl.h | 1 +
include/net/tcp.h | 1 +
net/core/user_dma.c | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
4 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff..cd9e7c0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
NET_TCP_MTU_PROBING=113,
NET_TCP_BASE_MSS=114,
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+ NET_TCP_DMA_COPYBREAK=116,
};
enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 578cccf..f1f4727 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_tw_reuse;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 9eee91b..b7c98db 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -30,6 +30,10 @@
#include <linux/rtnetlink.h> /* for BUG_TRAP */
#include <net/tcp.h>
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
/**
* dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3ad..6a6aa53 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
+#ifdef CONFIG_NET_DMA
+ {
+ .ctl_name = NET_TCP_DMA_COPYBREAK,
+ .procname = "tcp_dma_copybreak",
+ .data = &sysctl_tcp_dma_copybreak,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+#endif
{ .ctl_name = 0 }
};
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH 9/9] [I/OAT] TCP recv offload to I/OAT
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (7 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
@ 2006-05-24 0:20 ` Chris Leech
2006-05-24 0:22 ` [PATCH 0/9] I/OAT repost David Miller
9 siblings, 0 replies; 30+ messages in thread
From: Chris Leech @ 2006-05-24 0:20 UTC (permalink / raw)
To: linux-kernel, netdev
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv
Signed-off-by: Chris Leech <christopher.leech@intel.com>
---
net/ipv4/tcp.c | 103 ++++++++++++++++++++++++++++++++++++++++++++------
net/ipv4/tcp_input.c | 74 +++++++++++++++++++++++++++++++++---
net/ipv4/tcp_ipv4.c | 18 ++++++++-
net/ipv6/tcp_ipv6.c | 12 +++++-
4 files changed, 185 insertions(+), 22 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e067d2..ff6ccda 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
-
+#include <net/netdma.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
+ int copied_early = 0;
lock_sock(sk);
@@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, stru
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+#ifdef CONFIG_NET_DMA
+ tp->ucopy.dma_chan = NULL;
+ preempt_disable();
+ if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
+ !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) {
+ preempt_enable_no_resched();
+ tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
+ } else
+ preempt_enable_no_resched();
+#endif
+
do {
struct sk_buff *skb;
u32 offset;
@@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
} else
sk_wait_data(sk, &timeo);
+#ifdef CONFIG_NET_DMA
+ tp->ucopy.wakeup = 0;
+#endif
+
if (user_recv) {
int chunk;
@@ -1329,13 +1345,39 @@ do_prequeue:
}
if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
- break;
+#ifdef CONFIG_NET_DMA
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = get_softnet_dma();
+
+ if (tp->ucopy.dma_chan) {
+ tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+ tp->ucopy.dma_chan, skb, offset,
+ msg->msg_iov, used,
+ tp->ucopy.pinned_list);
+
+ if (tp->ucopy.dma_cookie < 0) {
+
+ printk(KERN_ALERT "dma_cookie < 0\n");
+
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ if ((offset + used) == skb->len)
+ copied_early = 1;
+
+ } else
+#endif
+ {
+ err = skb_copy_datagram_iovec(skb, offset,
+ msg->msg_iov, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
}
}
@@ -1355,15 +1397,19 @@ skip_copy:
if (skb->h.th->fin)
goto found_fin_ok;
- if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, 0);
+ if (!(flags & MSG_PEEK)) {
+ sk_eat_skb(sk, skb, copied_early);
+ copied_early = 0;
+ }
continue;
found_fin_ok:
/* Process the FIN. */
++*seq;
- if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb, 0);
+ if (!(flags & MSG_PEEK)) {
+ sk_eat_skb(sk, skb, copied_early);
+ copied_early = 0;
+ }
break;
} while (len > 0);
@@ -1386,6 +1432,36 @@ skip_copy:
tp->ucopy.len = 0;
}
+#ifdef CONFIG_NET_DMA
+ if (tp->ucopy.dma_chan) {
+ struct sk_buff *skb;
+ dma_cookie_t done, used;
+
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+ while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ tp->ucopy.dma_cookie, &done,
+ &used) == DMA_IN_PROGRESS) {
+ /* do partial cleanup of sk_async_wait_queue */
+ while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+ (dma_async_is_complete(skb->dma_cookie, done,
+ used) == DMA_SUCCESS)) {
+ __skb_dequeue(&sk->sk_async_wait_queue);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Safe to free early-copied skbs now */
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+ dma_chan_put(tp->ucopy.dma_chan);
+ tp->ucopy.dma_chan = NULL;
+ }
+ if (tp->ucopy.pinned_list) {
+ dma_unpin_iovec_pages(tp->ucopy.pinned_list);
+ tp->ucopy.pinned_list = NULL;
+ }
+#endif
+
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
@@ -1658,6 +1734,9 @@ int tcp_disconnect(struct sock *sk, int
__skb_queue_purge(&sk->sk_receive_queue);
sk_stream_writequeue_purge(sk);
__skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
inet->dport = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4a538bc..07826c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -71,6 +71,7 @@
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
+#include <net/netdma.h>
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
@@ -3787,6 +3788,50 @@ static inline int tcp_checksum_complete_
__tcp_checksum_complete_user(sk, skb);
}
+#ifdef CONFIG_NET_DMA
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+ int dma_cookie;
+ int copied_early = 0;
+
+ if (tp->ucopy.wakeup)
+ return 0;
+
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = get_softnet_dma();
+
+ if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+ dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+ skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+
+ if (dma_cookie < 0)
+ goto out;
+
+ tp->ucopy.dma_cookie = dma_cookie;
+ copied_early = 1;
+
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ tcp_rcv_space_adjust(sk);
+
+ if ((tp->ucopy.len == 0) ||
+ (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+ (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+ tp->ucopy.wakeup = 1;
+ sk->sk_data_ready(sk, 0);
+ }
+ } else if (chunk > 0) {
+ tp->ucopy.wakeup = 1;
+ sk->sk_data_ready(sk, 0);
+ }
+out:
+ return copied_early;
+}
+#endif /* CONFIG_NET_DMA */
+
/*
* TCP receive function for the ESTABLISHED state.
*
@@ -3903,14 +3948,23 @@ int tcp_rcv_established(struct sock *sk,
}
} else {
int eaten = 0;
+ int copied_early = 0;
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len &&
- sock_owned_by_user(sk)) {
- __set_current_state(TASK_RUNNING);
+ if (tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+ if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+ copied_early = 1;
+ eaten = 1;
+ }
+#endif
+ if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+ __set_current_state(TASK_RUNNING);
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+ eaten = 1;
+ }
+ if (eaten) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -3926,8 +3980,9 @@ int tcp_rcv_established(struct sock *sk,
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
- eaten = 1;
}
+ if (copied_early)
+ tcp_cleanup_rbuf(sk, skb->len);
}
if (!eaten) {
if (tcp_checksum_complete_user(sk, skb))
@@ -3968,6 +4023,11 @@ int tcp_rcv_established(struct sock *sk,
__tcp_ack_snd_check(sk, 0);
no_ack:
+#ifdef CONFIG_NET_DMA
+ if (copied_early)
+ __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+ else
+#endif
if (eaten)
__kfree_skb(skb);
else
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e..25ecc6e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -71,6 +71,7 @@
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
+#include <net/netdma.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -1091,8 +1092,18 @@ process:
bh_lock_sock(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+ tp->ucopy.dma_chan = get_softnet_dma();
+ if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
+ else
+#endif
+ {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ }
} else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
__skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+ /* Cleans up our sk_async_wait_queue */
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
+
/* Clean prequeue, it must be empty really */
__skb_queue_purge(&tp->ucopy.prequeue);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 301eee7..a50eb30 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1218,8 +1218,16 @@ process:
bh_lock_sock(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v6_do_rcv(sk, skb);
+#ifdef CONFIG_NET_DMA
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->ucopy.dma_chan)
+ ret = tcp_v6_do_rcv(sk, skb);
+ else
+#endif
+ {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v6_do_rcv(sk, skb);
+ }
} else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH 0/9] I/OAT repost
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
` (8 preceding siblings ...)
2006-05-24 0:20 ` [PATCH 9/9] [I/OAT] TCP recv offload to I/OAT Chris Leech
@ 2006-05-24 0:22 ` David Miller
9 siblings, 0 replies; 30+ messages in thread
From: David Miller @ 2006-05-24 0:22 UTC (permalink / raw)
To: christopher.leech; +Cc: linux-kernel, netdev
From: Chris Leech <christopher.leech@intel.com>
Subject: [PATCH 0/9] I/OAT repost
Date: Tue, 23 May 2006 17:16:53 -0700
> This is a repost of the I/OAT patches, the only changes from last time
> are refreshing the patches and removing an unused macro that was causing
> the vger spam filters to drop patch 2/9.
>
> This patch series is the a full release of the Intel(R) I/O
> Acceleration Technology (I/OAT) for Linux. It includes an in kernel API
> for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
> engine, and changes to the TCP stack to offload copies of received
> networking data to application space.
I'm going to apply this into a net-2.6.18 GIT tree, do some build
and sanity checking, then ask Andrew to pull it into -mm for testing.
Thanks guys.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
@ 2006-05-24 0:48 ` Andrew Morton
2006-05-24 6:31 ` Nathan Lynch
2006-05-24 0:51 ` Andrew Morton
2006-05-25 17:59 ` Olof Johansson
2 siblings, 1 reply; 30+ messages in thread
From: Andrew Morton @ 2006-05-24 0:48 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Chris Leech <christopher.leech@intel.com> wrote:
>
> + for_each_cpu(i)
That's about to be deleted. Please use for_each_possible_cpu().
That's if for_each_possible_cpu() is appropriate. Perhaps it should be
using for_each_present_cpu(), or for_each_online_cpu(). That's why
for_each_cpu() is going away - to make people think about such things..
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-05-24 0:48 ` Andrew Morton
@ 2006-05-24 0:51 ` Andrew Morton
2006-05-25 17:59 ` Olof Johansson
2 siblings, 0 replies; 30+ messages in thread
From: Andrew Morton @ 2006-05-24 0:51 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Chris Leech <christopher.leech@intel.com> wrote:
>
> +/**
> + * dma_client_chan_free - release a DMA channel
> + * @chan: &dma_chan
> + */
> +void dma_chan_cleanup(struct kref *kref)
> +{
> + struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
> + chan->device->device_free_chan_resources(chan);
> + chan->client = NULL;
> + kref_put(&chan->device->refcount, dma_async_device_cleanup);
> +}
> +
> +static void dma_chan_free_rcu(struct rcu_head *rcu)
> +{
> + struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
> + int bias = 0x7FFFFFFF;
> + int i;
> + for_each_cpu(i)
> + bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
> + atomic_sub(bias, &chan->refcount.refcount);
> + kref_put(&chan->refcount, dma_chan_cleanup);
> +}
> +
> +static void dma_client_chan_free(struct dma_chan *chan)
> +{
> + atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
> + chan->slow_ref = 1;
> + call_rcu(&chan->rcu, dma_chan_free_rcu);
> +}
A comment describing this `bias' magic would be nice.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
@ 2006-05-24 0:56 ` Andrew Morton
2006-05-25 18:00 ` Olof Johansson
2006-05-30 23:27 ` Adrian Bunk
2 siblings, 0 replies; 30+ messages in thread
From: Andrew Morton @ 2006-05-24 0:56 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Chris Leech <christopher.leech@intel.com> wrote:
>
> +static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
> +{
> + unsigned long phys_complete;
> + struct ioat_desc_sw *desc, *_desc;
> + dma_cookie_t cookie = 0;
> +
> + prefetch(chan->completion_virt);
> +
> + if (!spin_trylock(&chan->cleanup_lock))
> + return;
> +
spin_trylock() is a red flag. It often means that someone screwed their
locking up.
It at least needs a comment explaining its presence.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
2006-05-24 0:20 ` [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
@ 2006-05-24 1:02 ` Andrew Morton
0 siblings, 0 replies; 30+ messages in thread
From: Andrew Morton @ 2006-05-24 1:02 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Chris Leech <christopher.leech@intel.com> wrote:
>
> Any socket recv of less than this ammount will not be offloaded
>
> Signed-off-by: Chris Leech <christopher.leech@intel.com>
> ---
>
> include/linux/sysctl.h | 1 +
> include/net/tcp.h | 1 +
> net/core/user_dma.c | 4 ++++
> net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
> 4 files changed, 16 insertions(+), 0 deletions(-)
>
Documentation/networking/ip-sysctl.txt too, please.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client
2006-05-24 0:20 ` [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
@ 2006-05-24 1:13 ` David Miller
0 siblings, 0 replies; 30+ messages in thread
From: David Miller @ 2006-05-24 1:13 UTC (permalink / raw)
To: christopher.leech; +Cc: linux-kernel, netdev
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:20:15 -0700
> +static void net_dma_rebalance(void)
> +{
> + unsigned int cpu, i, n;
> + struct dma_chan *chan;
> +
> + lock_cpu_hotplug();
You can't call lock_cpu_hotplug(), because that sleeps and takes
semaphores and we currently hold a spinlock taken here:
> +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
> + enum dma_event event)
> +{
> + spin_lock(&net_dma_event_lock);
> + switch (event) {
> + case DMA_RESOURCE_ADDED:
> + net_dma_count++;
> + net_dma_rebalance();
> + break;
You'll need to run this DMA rebalancing asynchronously in process
context via keventd or similar to deal with this locking bug.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-24 0:48 ` Andrew Morton
@ 2006-05-24 6:31 ` Nathan Lynch
0 siblings, 0 replies; 30+ messages in thread
From: Nathan Lynch @ 2006-05-24 6:31 UTC (permalink / raw)
To: Andrew Morton; +Cc: Chris Leech, linux-kernel, netdev
Andrew Morton wrote:
> Chris Leech <christopher.leech@intel.com> wrote:
> >
> > + for_each_cpu(i)
>
> That's about to be deleted. Please use for_each_possible_cpu().
>
> That's if for_each_possible_cpu() is appropriate.
It is -- those loops traverse chan->local, which is alloc_percpu'd,
which allocates for all possible cpus.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-05-24 0:48 ` Andrew Morton
2006-05-24 0:51 ` Andrew Morton
@ 2006-05-25 17:59 ` Olof Johansson
2006-05-25 18:09 ` Olof Johansson
2 siblings, 1 reply; 30+ messages in thread
From: Olof Johansson @ 2006-05-25 17:59 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Hi,
On Tue, May 23, 2006 at 05:20:12PM -0700, Chris Leech wrote:
> +EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
> +EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
> +EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
Is there a specific reason for why you chose to export 3 different
memcpu calls? They're all just wrapped to the same internals.
It would seem to make sense to have the client do their own
page_address(page) + offset calculations and just export one function?
-Olof
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
2006-05-24 0:56 ` Andrew Morton
@ 2006-05-25 18:00 ` Olof Johansson
2006-05-30 23:27 ` Adrian Bunk
2 siblings, 0 replies; 30+ messages in thread
From: Olof Johansson @ 2006-05-25 18:00 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
Hi,
Minor nitpick below:
On Tue, May 23, 2006 at 05:20:13PM -0700, Chris Leech wrote:
> +static int enumerate_dma_channels(struct ioat_device *device)
[...]
> + enumerate_dma_channels(device);
Return value is never used, might as well change the function
declaration to void.
-Olof
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/9] [I/OAT] DMA memcpy subsystem
2006-05-25 17:59 ` Olof Johansson
@ 2006-05-25 18:09 ` Olof Johansson
0 siblings, 0 replies; 30+ messages in thread
From: Olof Johansson @ 2006-05-25 18:09 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
On Thu, May 25, 2006 at 10:59:40AM -0700, Olof Johansson wrote:
> Is there a specific reason for why you chose to export 3 different
> memcpu calls? They're all just wrapped to the same internals.
>
> It would seem to make sense to have the client do their own
> page_address(page) + offset calculations and just export one function?
Nevermind. I'm too used to 64-bit environments where all memory is
always addressable to the kernel. There's obvious reasons to do it on
32-bit platforms to avoid the extra kernel mapping.
-Olof
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
2006-05-24 0:56 ` Andrew Morton
2006-05-25 18:00 ` Olof Johansson
@ 2006-05-30 23:27 ` Adrian Bunk
2 siblings, 0 replies; 30+ messages in thread
From: Adrian Bunk @ 2006-05-30 23:27 UTC (permalink / raw)
To: Chris Leech; +Cc: linux-kernel, netdev
On Tue, May 23, 2006 at 05:20:13PM -0700, Chris Leech wrote:
> Adds a new ioatdma driver
>
> Signed-off-by: Chris Leech <christopher.leech@intel.com>
> ---
>
> drivers/dma/Kconfig | 9
> drivers/dma/Makefile | 1
> drivers/dma/ioatdma.c | 839 +++++++++++++++++++++++++++++++++++++++
> drivers/dma/ioatdma.h | 126 ++++++
> drivers/dma/ioatdma_hw.h | 52 ++
> drivers/dma/ioatdma_io.h | 118 +++++
> drivers/dma/ioatdma_registers.h | 126 ++++++
> 7 files changed, 1271 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> index f9ac4bc..0f15e76 100644
> --- a/drivers/dma/Kconfig
> +++ b/drivers/dma/Kconfig
> @@ -10,4 +10,13 @@ config DMA_ENGINE
> DMA engines offload copy operations from the CPU to dedicated
> hardware, allowing the copies to happen asynchronously.
>
> +comment "DMA Devices"
> +
> +config INTEL_IOATDMA
> + tristate "Intel I/OAT DMA support"
> + depends on DMA_ENGINE && PCI
> + default m
> + ---help---
> + Enable support for the Intel(R) I/OAT DMA engine.
> +
> endmenu
>...
- please don't use "default m"
- can you enhance the help text?
e.g. you could list which hardware contains this DMA engine
cu
Adrian
--
"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed
^ permalink raw reply [flat|nested] 30+ messages in thread
end of thread, other threads:[~2006-05-30 23:27 UTC | newest]
Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-05-24 0:16 [PATCH 0/9] I/OAT repost Chris Leech
2006-05-24 0:20 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-05-24 0:48 ` Andrew Morton
2006-05-24 6:31 ` Nathan Lynch
2006-05-24 0:51 ` Andrew Morton
2006-05-25 17:59 ` Olof Johansson
2006-05-25 18:09 ` Olof Johansson
2006-05-24 0:20 ` [PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine Chris Leech
2006-05-24 0:56 ` Andrew Morton
2006-05-25 18:00 ` Olof Johansson
2006-05-30 23:27 ` Adrian Bunk
2006-05-24 0:20 ` [PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client Chris Leech
2006-05-24 1:13 ` David Miller
2006-05-24 0:20 ` [PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies Chris Leech
2006-05-24 0:20 ` [PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT Chris Leech
2006-05-24 0:20 ` [PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Chris Leech
2006-05-24 0:20 ` [PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware Chris Leech
2006-05-24 0:20 ` [PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold Chris Leech
2006-05-24 1:02 ` Andrew Morton
2006-05-24 0:20 ` [PATCH 9/9] [I/OAT] TCP recv offload to I/OAT Chris Leech
2006-05-24 0:22 ` [PATCH 0/9] I/OAT repost David Miller
-- strict thread matches above, loose matches on Subject: below --
2006-05-08 22:16 [PATCH 0/9] I/OAT network recv copy offload Chris Leech
2006-05-08 22:17 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-03-29 22:55 [PATCH 0/9] I/OAT Chris Leech
2006-03-29 22:55 ` [PATCH 1/9] [I/OAT] DMA memcpy subsystem Chris Leech
2006-03-30 8:01 ` Kumar Gala
2006-03-30 18:36 ` Andrew Grover
2006-03-30 19:57 ` Kumar Gala
2006-03-31 8:26 ` Ingo Oeser
2006-03-31 20:04 ` Andrew Grover
2006-03-31 20:06 ` Kumar Gala
2006-03-31 20:27 ` Andrew Grover
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).