LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 4/8] v2 Allow memory block to span multiple memory sections
From: Nathan Fontenot @ 2010-09-27 19:25 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev
  Cc: Greg KH, KAMEZAWA Hiroyuki, Dave Hansen
In-Reply-To: <4CA0EBEB.1030204@austin.ibm.com>

Update the memory sysfs code such that each sysfs memory directory is now
considered a memory block that can span multiple memory sections per
memory block.  The default size of each memory block is SECTION_SIZE_BITS
to maintain the current behavior of having a single memory section per
memory block (i.e. one sysfs directory per memory section).

For architectures that want to have memory blocks span multiple
memory sections they need only define their own memory_block_size_bytes()
routine.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>

---
 drivers/base/memory.c |  155 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 108 insertions(+), 47 deletions(-)

Index: linux-next/drivers/base/memory.c
===================================================================
--- linux-next.orig/drivers/base/memory.c	2010-09-27 09:31:57.000000000 -0500
+++ linux-next/drivers/base/memory.c	2010-09-27 13:50:18.000000000 -0500
@@ -30,6 +30,14 @@
 static DEFINE_MUTEX(mem_sysfs_mutex);
 
 #define MEMORY_CLASS_NAME	"memory"
+#define MIN_MEMORY_BLOCK_SIZE	(1 << SECTION_SIZE_BITS)
+
+static int sections_per_block;
+
+static inline int base_memory_block_id(int section_nr)
+{
+	return section_nr / sections_per_block;
+}
 
 static struct sysdev_class memory_sysdev_class = {
 	.name = MEMORY_CLASS_NAME,
@@ -84,28 +92,47 @@
  * register_memory - Setup a sysfs device for a memory block
  */
 static
-int register_memory(struct memory_block *memory, struct mem_section *section)
+int register_memory(struct memory_block *memory)
 {
 	int error;
 
 	memory->sysdev.cls = &memory_sysdev_class;
-	memory->sysdev.id = __section_nr(section);
+	memory->sysdev.id = memory->phys_index / sections_per_block;
 
 	error = sysdev_register(&memory->sysdev);
 	return error;
 }
 
 static void
-unregister_memory(struct memory_block *memory, struct mem_section *section)
+unregister_memory(struct memory_block *memory)
 {
 	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
-	BUG_ON(memory->sysdev.id != __section_nr(section));
 
 	/* drop the ref. we got in remove_memory_block() */
 	kobject_put(&memory->sysdev.kobj);
 	sysdev_unregister(&memory->sysdev);
 }
 
+u32 __weak memory_block_size_bytes(void)
+{
+	return MIN_MEMORY_BLOCK_SIZE;
+}
+
+static u32 get_memory_block_size(void)
+{
+	u32 block_sz;
+
+	block_sz = memory_block_size_bytes();
+
+	/* Validate blk_sz is a power of 2 and not less than section size */
+	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
+		WARN_ON(1);
+		block_sz = MIN_MEMORY_BLOCK_SIZE;
+	}
+
+	return block_sz;
+}
+
 /*
  * use this as the physical section index that this memsection
  * uses.
@@ -116,7 +143,7 @@
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
-	return sprintf(buf, "%08lx\n", mem->phys_index);
+	return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
 }
 
 /*
@@ -125,13 +152,16 @@
 static ssize_t show_mem_removable(struct sys_device *dev,
 			struct sysdev_attribute *attr, char *buf)
 {
-	unsigned long start_pfn;
-	int ret;
+	unsigned long i, pfn;
+	int ret = 1;
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
 
-	start_pfn = section_nr_to_pfn(mem->phys_index);
-	ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
+	for (i = 0; i < sections_per_block; i++) {
+		pfn = section_nr_to_pfn(mem->phys_index + i);
+		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
+	}
+
 	return sprintf(buf, "%d\n", ret);
 }
 
@@ -184,17 +214,14 @@
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(struct memory_block *mem, unsigned long action)
+memory_section_action(unsigned long phys_index, unsigned long action)
 {
 	int i;
-	unsigned long psection;
 	unsigned long start_pfn, start_paddr;
 	struct page *first_page;
 	int ret;
-	int old_state = mem->state;
 
-	psection = mem->phys_index;
-	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
 
 	/*
 	 * The probe routines leave the pages reserved, just
@@ -207,8 +234,8 @@
 				continue;
 
 			printk(KERN_WARNING "section number %ld page number %d "
-				"not reserved, was it already online? \n",
-				psection, i);
+				"not reserved, was it already online?\n",
+				phys_index, i);
 			return -EBUSY;
 		}
 	}
@@ -219,18 +246,13 @@
 			ret = online_pages(start_pfn, PAGES_PER_SECTION);
 			break;
 		case MEM_OFFLINE:
-			mem->state = MEM_GOING_OFFLINE;
 			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
 			ret = remove_memory(start_paddr,
 					    PAGES_PER_SECTION << PAGE_SHIFT);
-			if (ret) {
-				mem->state = old_state;
-				break;
-			}
 			break;
 		default:
-			WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
-					__func__, mem, action, action);
+			WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
+			     "%ld\n", __func__, phys_index, action, action);
 			ret = -EINVAL;
 	}
 
@@ -240,7 +262,8 @@
 static int memory_block_change_state(struct memory_block *mem,
 		unsigned long to_state, unsigned long from_state_req)
 {
-	int ret = 0;
+	int i, ret = 0;
+
 	mutex_lock(&mem->state_mutex);
 
 	if (mem->state != from_state_req) {
@@ -248,8 +271,22 @@
 		goto out;
 	}
 
-	ret = memory_block_action(mem, to_state);
-	if (!ret)
+	if (to_state == MEM_OFFLINE)
+		mem->state = MEM_GOING_OFFLINE;
+
+	for (i = 0; i < sections_per_block; i++) {
+		ret = memory_section_action(mem->phys_index + i, to_state);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (i = 0; i < sections_per_block; i++)
+			memory_section_action(mem->phys_index + i,
+					      from_state_req);
+
+		mem->state = from_state_req;
+	} else
 		mem->state = to_state;
 
 out:
@@ -262,20 +299,15 @@
 		struct sysdev_attribute *attr, const char *buf, size_t count)
 {
 	struct memory_block *mem;
-	unsigned int phys_section_nr;
 	int ret = -EINVAL;
 
 	mem = container_of(dev, struct memory_block, sysdev);
-	phys_section_nr = mem->phys_index;
-
-	if (!present_section_nr(phys_section_nr))
-		goto out;
 
 	if (!strncmp(buf, "online", min((int)count, 6)))
 		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 	else if(!strncmp(buf, "offline", min((int)count, 7)))
 		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
-out:
+
 	if (ret)
 		return ret;
 	return count;
@@ -315,7 +347,7 @@
 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
 		 char *buf)
 {
-	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
+	return sprintf(buf, "%x\n", get_memory_block_size());
 }
 
 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
@@ -451,12 +483,13 @@
 	struct sys_device *sysdev;
 	struct memory_block *mem;
 	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+	int block_id = base_memory_block_id(__section_nr(section));
 
 	/*
 	 * This only works because we know that section == sysdev->id
 	 * slightly redundant with sysdev_register()
 	 */
-	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);
 
 	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
 	if (!kobj)
@@ -468,26 +501,27 @@
 	return mem;
 }
 
-static int add_memory_block(int nid, struct mem_section *section,
-			unsigned long state, enum mem_add_context context)
+static int init_memory_block(struct memory_block **memory,
+			     struct mem_section *section, unsigned long state)
 {
-	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	struct memory_block *mem;
 	unsigned long start_pfn;
+	int scn_nr;
 	int ret = 0;
 
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 	if (!mem)
 		return -ENOMEM;
 
-	mutex_lock(&mem_sysfs_mutex);
-
-	mem->phys_index = __section_nr(section);
+	scn_nr = __section_nr(section);
+	mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
 	mem->state = state;
 	atomic_inc(&mem->section_count);
 	mutex_init(&mem->state_mutex);
 	start_pfn = section_nr_to_pfn(mem->phys_index);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
 
-	ret = register_memory(mem, section);
+	ret = register_memory(mem);
 	if (!ret)
 		ret = mem_create_simple_file(mem, phys_index);
 	if (!ret)
@@ -496,8 +530,29 @@
 		ret = mem_create_simple_file(mem, phys_device);
 	if (!ret)
 		ret = mem_create_simple_file(mem, removable);
+
+	*memory = mem;
+	return ret;
+}
+
+static int add_memory_section(int nid, struct mem_section *section,
+			unsigned long state, enum mem_add_context context)
+{
+	struct memory_block *mem;
+	int ret = 0;
+
+	mutex_lock(&mem_sysfs_mutex);
+
+	mem = find_memory_block(section);
+	if (mem) {
+		atomic_inc(&mem->section_count);
+		kobject_put(&mem->sysdev.kobj);
+	} else
+		ret = init_memory_block(&mem, section, state);
+
 	if (!ret) {
-		if (context == HOTPLUG)
+		if (context == HOTPLUG &&
+		    atomic_read(&mem->section_count) == sections_per_block)
 			ret = register_mem_sect_under_node(mem, nid);
 	}
 
@@ -519,8 +574,10 @@
 		mem_remove_simple_file(mem, state);
 		mem_remove_simple_file(mem, phys_device);
 		mem_remove_simple_file(mem, removable);
-		unregister_memory(mem, section);
-	}
+		unregister_memory(mem);
+		kfree(mem);
+	} else
+		kobject_put(&mem->sysdev.kobj);
 
 	mutex_unlock(&mem_sysfs_mutex);
 	return 0;
@@ -532,7 +589,7 @@
  */
 int register_new_memory(int nid, struct mem_section *section)
 {
-	return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG);
+	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
 }
 
 int unregister_memory_section(struct mem_section *section)
@@ -551,12 +608,16 @@
 	unsigned int i;
 	int ret;
 	int err;
+	int block_sz;
 
 	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
 	ret = sysdev_class_register(&memory_sysdev_class);
 	if (ret)
 		goto out;
 
+	block_sz = get_memory_block_size();
+	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+
 	/*
 	 * Create entries for memory sections that were found
 	 * during boot and have been initialized
@@ -564,8 +625,8 @@
 	for (i = 0; i < NR_MEM_SECTIONS; i++) {
 		if (!present_section_nr(i))
 			continue;
-		err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE,
-				       BOOT);
+		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
+					 BOOT);
 		if (!ret)
 			ret = err;
 	}

^ permalink raw reply

* [PATCH 3/8] v2 Add mutex for adding/removing memory blocks
From: Nathan Fontenot @ 2010-09-27 19:23 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev
  Cc: Greg KH, KAMEZAWA Hiroyuki, Dave Hansen
In-Reply-To: <4CA0EBEB.1030204@austin.ibm.com>

Add a new mutex for use in adding and removing of memory blocks.  This
is needed to avoid any race conditions in which the same memory block could
be added and removed at the same time.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>

---
 drivers/base/memory.c |    7 +++++++
 1 file changed, 7 insertions(+)

Index: linux-next/drivers/base/memory.c
===================================================================
--- linux-next.orig/drivers/base/memory.c	2010-09-27 09:31:35.000000000 -0500
+++ linux-next/drivers/base/memory.c	2010-09-27 09:31:57.000000000 -0500
@@ -27,6 +27,8 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
+static DEFINE_MUTEX(mem_sysfs_mutex);
+
 #define MEMORY_CLASS_NAME	"memory"
 
 static struct sysdev_class memory_sysdev_class = {
@@ -476,6 +478,8 @@
 	if (!mem)
 		return -ENOMEM;
 
+	mutex_lock(&mem_sysfs_mutex);
+
 	mem->phys_index = __section_nr(section);
 	mem->state = state;
 	atomic_inc(&mem->section_count);
@@ -497,6 +501,7 @@
 			ret = register_mem_sect_under_node(mem, nid);
 	}
 
+	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
@@ -505,6 +510,7 @@
 {
 	struct memory_block *mem;
 
+	mutex_lock(&mem_sysfs_mutex);
 	mem = find_memory_block(section);
 
 	if (atomic_dec_and_test(&mem->section_count)) {
@@ -516,6 +522,7 @@
 		unregister_memory(mem, section);
 	}
 
+	mutex_unlock(&mem_sysfs_mutex);
 	return 0;
 }

^ permalink raw reply

* [PATCH 2/8] v2 Add section count to memory_block struct
From: Nathan Fontenot @ 2010-09-27 19:22 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev
  Cc: Greg KH, KAMEZAWA Hiroyuki, Dave Hansen
In-Reply-To: <4CA0EBEB.1030204@austin.ibm.com>

Add a section count property to the memory_block struct to track the number
of memory sections that have been added/removed from a memory block. This
allows us to know when the last memory section of a memory block has been
removed so we can remove the memory block.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>

---
 drivers/base/memory.c  |   16 ++++++++++------
 include/linux/memory.h |    3 +++
 2 files changed, 13 insertions(+), 6 deletions(-)

Index: linux-next/drivers/base/memory.c
===================================================================
--- linux-next.orig/drivers/base/memory.c	2010-09-27 09:17:20.000000000 -0500
+++ linux-next/drivers/base/memory.c	2010-09-27 09:31:35.000000000 -0500
@@ -478,6 +478,7 @@
 
 	mem->phys_index = __section_nr(section);
 	mem->state = state;
+	atomic_inc(&mem->section_count);
 	mutex_init(&mem->state_mutex);
 	start_pfn = section_nr_to_pfn(mem->phys_index);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
@@ -505,12 +506,15 @@
 	struct memory_block *mem;
 
 	mem = find_memory_block(section);
-	unregister_mem_sect_under_nodes(mem);
-	mem_remove_simple_file(mem, phys_index);
-	mem_remove_simple_file(mem, state);
-	mem_remove_simple_file(mem, phys_device);
-	mem_remove_simple_file(mem, removable);
-	unregister_memory(mem, section);
+
+	if (atomic_dec_and_test(&mem->section_count)) {
+		unregister_mem_sect_under_nodes(mem);
+		mem_remove_simple_file(mem, phys_index);
+		mem_remove_simple_file(mem, state);
+		mem_remove_simple_file(mem, phys_device);
+		mem_remove_simple_file(mem, removable);
+		unregister_memory(mem, section);
+	}
 
 	return 0;
 }
Index: linux-next/include/linux/memory.h
===================================================================
--- linux-next.orig/include/linux/memory.h	2010-09-27 09:17:20.000000000 -0500
+++ linux-next/include/linux/memory.h	2010-09-27 09:22:56.000000000 -0500
@@ -19,10 +19,13 @@
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/mutex.h>
+#include <asm/atomic.h>
 
 struct memory_block {
 	unsigned long phys_index;
 	unsigned long state;
+	atomic_t section_count;
+
 	/*
 	 * This serializes all state change requests.  It isn't
 	 * held during creation because the control files are

^ permalink raw reply

* [PATCH 1/8] v2 Move find_memory_block() routine
From: Nathan Fontenot @ 2010-09-27 19:21 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev
  Cc: Greg KH, KAMEZAWA Hiroyuki, Dave Hansen
In-Reply-To: <4CA0EBEB.1030204@austin.ibm.com>

Move the find_memory_block() routine up to avoid needing a forward
declaration in subsequent patches.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>

---
 drivers/base/memory.c |   62 +++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

Index: linux-next/drivers/base/memory.c
===================================================================
--- linux-next.orig/drivers/base/memory.c	2010-09-21 11:59:24.000000000 -0500
+++ linux-next/drivers/base/memory.c	2010-09-21 12:32:45.000000000 -0500
@@ -435,6 +435,37 @@ int __weak arch_get_memory_phys_device(u
 	return 0;
 }
 
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+struct memory_block *find_memory_block(struct mem_section *section)
+{
+	struct kobject *kobj;
+	struct sys_device *sysdev;
+	struct memory_block *mem;
+	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+	/*
+	 * This only works because we know that section == sysdev->id
+	 * slightly redundant with sysdev_register()
+	 */
+	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+	if (!kobj)
+		return NULL;
+
+	sysdev = container_of(kobj, struct sys_device, kobj);
+	mem = container_of(sysdev, struct memory_block, sysdev);
+
+	return mem;
+}
+
 static int add_memory_block(int nid, struct mem_section *section,
 			unsigned long state, enum mem_add_context context)
 {
@@ -468,37 +499,6 @@ static int add_memory_block(int nid, str
 	return ret;
 }
 
-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all sysdev classes.
- */
-struct memory_block *find_memory_block(struct mem_section *section)
-{
-	struct kobject *kobj;
-	struct sys_device *sysdev;
-	struct memory_block *mem;
-	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
-
-	/*
-	 * This only works because we know that section == sysdev->id
-	 * slightly redundant with sysdev_register()
-	 */
-	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
-
-	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
-	if (!kobj)
-		return NULL;
-
-	sysdev = container_of(kobj, struct sys_device, kobj);
-	mem = container_of(sysdev, struct memory_block, sysdev);
-
-	return mem;
-}
-
 int remove_memory_block(unsigned long node_id, struct mem_section *section,
 		int phys_device)
 {

^ permalink raw reply

* [PATCH 0/8] v2 De-Couple sysfs memory directories from memory sections
From: Nathan Fontenot @ 2010-09-27 19:09 UTC (permalink / raw)
  To: linux-kernel, linux-mm, linuxppc-dev
  Cc: Greg KH, KAMEZAWA Hiroyuki, Dave Hansen

This set of patches decouples the concept that a single memory
section corresponds to a single directory in 
/sys/devices/system/memory/.  On systems
with large amounts of memory (1+ TB) there are perfomance issues
related to creating the large number of sysfs directories.  For
a powerpc machine with 1 TB of memory we are creating 63,000+
directories.  This is resulting in boot times of around 45-50
minutes for systems with 1 TB of memory and 8 hours for systems
with 2 TB of memory.  With this patch set applied I am now seeing
boot times of 5 minutes or less.

The root of this issue is in sysfs directory creation. Every time
a directory is created a string compare is done against all sibling
directories to ensure we do not create duplicates.  The list of
directory nodes in sysfs is kept as an unsorted list which results
in this being an exponentially longer operation as the number of
directories are created.

The solution solved by this patch set is to allow a single
directory in sysfs to span multiple memory sections.  This is
controlled by an optional architecturally defined function
memory_block_size_bytes().  The default definition of this
routine returns a memory block size equal to the memory section
size. This maintains the current layout of sysfs memory
directories as it appears to userspace to remain the same as it
is today.

For architectures that define their own version of this routine,
as is done for powerpc in this patchset, the view in userspace
would change such that each memoryXXX directory would span
multiple memory sections.  The number of sections spanned would
depend on the value reported by memory_block_size_bytes.

In both cases a new file 'end_phys_index' is created in each
memoryXXX directory.  This file will contain the physical id
of the last memory section covered by the sysfs directory.  For
the default case, the value in 'end_phys_index' will be the same
as in the existing 'phys_index' file.

This version of the patch set includes an update to to properly
report block_size_bytes, phys_index, and end_phys_index.  Additionally,
the patch that adds the end_phys_index sysfs file is now patch 5/8
instead of being patch 2/8 as in the previous version of the patches.

-Nathan Fontenot

^ permalink raw reply

* Re: [PATCH RFCv2 1/2] dmaengine: add support for scatterlist to scatterlist transfers
From: Dan Williams @ 2010-09-27 17:35 UTC (permalink / raw)
  To: Ira W. Snyder; +Cc: Per Fridén, Linus Walleij, linuxppc-dev, linux-kernel
In-Reply-To: <20100927172356.GA805@ovro.caltech.edu>

On Mon, Sep 27, 2010 at 10:23 AM, Ira W. Snyder <iws@ovro.caltech.edu> wrot=
e:
> On Mon, Sep 27, 2010 at 05:23:34PM +0200, Linus Walleij wrote:
>> 2010/9/25 Ira W. Snyder <iws@ovro.caltech.edu>:
>>
>> > This adds support for scatterlist to scatterlist DMA transfers.
>>
>> This is a good idea, we have a local function to do this in DMA40 alread=
y,
>> stedma40_memcpy_sg().
>>
>
> I think that having two devices that want to implement this
> functionality as part of the DMAEngine API is a good argument for making
> it available as part of the core API. I think it would be good to add
> this to struct dma_device, and add a capability (DMA_SG?) for it as
> well.
>
> I have looked at the stedma40_memcpy_sg() function, and I think we would
> want to extend it slightly for the generic API. Is there any good reason
> to prohibit scatterlists with different numbers of elements?
>
> For example:
> src scatterlist: 10 elements, each with 4K length (40K total)
> dst scatterlist: 40 elements, each with 1K length (40K total)
>
> The total length of both scatterlists is equal, but the number of
> scatterlist entries is different. The freescale DMA controller can
> handle this just fine.
>
> I'm proposing this function signature:
> struct dma_async_tx_descriptor *
> dma_memcpy_sg(struct dma_chan *chan,
> =A0 =A0 =A0 =A0 =A0 =A0 =A0struct scatterlist *dst_sg, unsigned int dst_n=
ents,
> =A0 =A0 =A0 =A0 =A0 =A0 =A0struct scatterlist *src_sg, unsigned int src_n=
ents,
> =A0 =A0 =A0 =A0 =A0 =A0 =A0unsigned long flags);
>
>> > This is
>> > currently hidden behind a configuration option, which will allow drive=
rs
>> > which need this functionality to select it individually.
>>
>> Why? Isn't it better to add this as a new capability flag
>> if you don't want to announce it? Or is the intent to save
>> memory footprint?
>>
>
> Dan wanted this, probably for memory footprint. If >1 driver is using
> it,

Yes, I did not see a reason to increment the size of dmaengine.o for
everyone if only one out-of-tree user of the function existed.

> I would rather have it as part of struct dma_device along with a
> capability.

I think having this as a dma_device method makes sense now that more
than one driver would implement it, and let's drivers see the entirety
of the transaction in one call.

--
Dan

^ permalink raw reply

* Re: [PATCH RFCv2 1/2] dmaengine: add support for scatterlist to scatterlist transfers
From: Ira W. Snyder @ 2010-09-27 17:23 UTC (permalink / raw)
  To: Linus Walleij; +Cc: Per Fridén, Dan Williams, linuxppc-dev, linux-kernel
In-Reply-To: <AANLkTimzdEhAZ=oXbbB3Ge5=YY7OXq3fjaRH4roPNB-k@mail.gmail.com>

On Mon, Sep 27, 2010 at 05:23:34PM +0200, Linus Walleij wrote:
> 2010/9/25 Ira W. Snyder <iws@ovro.caltech.edu>:
> 
> > This adds support for scatterlist to scatterlist DMA transfers.
> 
> This is a good idea, we have a local function to do this in DMA40 already,
> stedma40_memcpy_sg().
> 

I think that having two devices that want to implement this
functionality as part of the DMAEngine API is a good argument for making
it available as part of the core API. I think it would be good to add
this to struct dma_device, and add a capability (DMA_SG?) for it as
well.

I have looked at the stedma40_memcpy_sg() function, and I think we would
want to extend it slightly for the generic API. Is there any good reason
to prohibit scatterlists with different numbers of elements?

For example:
src scatterlist: 10 elements, each with 4K length (40K total)
dst scatterlist: 40 elements, each with 1K length (40K total)

The total length of both scatterlists is equal, but the number of
scatterlist entries is different. The freescale DMA controller can
handle this just fine.

I'm proposing this function signature:
struct dma_async_tx_descriptor *
dma_memcpy_sg(struct dma_chan *chan,
	      struct scatterlist *dst_sg, unsigned int dst_nents,
	      struct scatterlist *src_sg, unsigned int src_nents,
	      unsigned long flags);

> > This is
> > currently hidden behind a configuration option, which will allow drivers
> > which need this functionality to select it individually.
> 
> Why? Isn't it better to add this as a new capability flag
> if you don't want to announce it? Or is the intent to save
> memory footprint?
> 

Dan wanted this, probably for memory footprint. If >1 driver is using
it, I would rather have it as part of struct dma_device along with a
capability.

Thanks for the feedback,
Ira

^ permalink raw reply

* Re: [PATCH v6 0/8] ptp: IEEE 1588 hardware clock support
From: Alan Cox @ 2010-09-27 17:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: John Stultz, Rodolfo Giometti, Arnd Bergmann, Peter Zijlstra,
	linux-api, devicetree-discuss, linux-kernel, David Miller, netdev,
	Thomas Gleixner, linuxppc-dev, Richard Cochran, linux-arm-kernel,
	Krzysztof Halasa
In-Reply-To: <alpine.DEB.2.00.1009271054320.9258@router.home>

On Mon, 27 Sep 2010 10:56:09 -0500 (CDT)
Christoph Lameter <cl@linux.com> wrote:

> 
> On Fri, 24 Sep 2010, Alan Cox wrote:
> 
> > Whether you add new syscalls or do the fd passing using flags and hide
> > the ugly bits in glibc is another question.
> 
> Use device specific ioctls instead of syscalls?

Some of the ioctls are probably not device specific, the job of the OS in
part is to present a unified interface. We already have a mess of HPET
and RTC driver ioctls.

Some of it undoubtedly is device specific.

Alan

^ permalink raw reply

* Re: [PATCH v6 0/8] ptp: IEEE 1588 hardware clock support
From: M. Warner Losh @ 2010-09-27 16:14 UTC (permalink / raw)
  To: cl
  Cc: richardcochran, peterz, johnstul, devicetree-discuss,
	linuxppc-dev, linux-kernel, netdev, linux-api, tglx, giometti,
	davem, linux-arm-kernel, khc
In-Reply-To: <alpine.DEB.2.00.1009271038150.9258@router.home>

In message: <alpine.DEB.2.00.1009271038150.9258@router.home>
            Christoph Lameter <cl@linux.com> writes:
: On Thu, 23 Sep 2010, john stultz wrote:
: > The design actually avoids most userland induced latency.
: >
: > 1) On the PTP hardware syncing point, the reference packet gets
: > timestamped with the PTP hardware time on arrival. This allows the
: > offset calculation to be done in userland without introducing latency.
: 
: The timestamps allows the calculation of the network transmission time I
: guess and therefore its more accurate to calculate that effect out. Ok but
: then the overhead of getting to code in user space (that does the proper
: clock adjustments) is resulting in the addition of a relatively long time
: that is subject to OS scheduling latencies and noises.

The timestamps at the hardware level allow you to factor out variation
caused by OS Scheduling, OS network stack delay and internal buffering
on the NIC.  Variation in measurements is what kills accuracy.

When steering a clock by making an error measurement of the phase and
frequency of it, the latency induced by OS scheduling tends to be
unimportant.  It is far more important to know when you steered the
clock (called adjtime or friends) than to steer it at any fixed
latency to when the data for the measurements was made.  Measuring the
time of steer can tolerate errors in the range of OS scheduling
latencies easily, since that tends to produce a very small effect.  It
introduces an error in your expected phase for the next measurement on
the order of the product of the time of steer error times the change
in fractional frequency (abs( 1 - (nu_new / nu_old))).  Even if the
estimate is really bad at 100ms, most steers are on the order about
one part per million.  This leads to a sub-nanosecond phase error
estimate in the next measurement cycle (a non-accumulating error).  A
1ms error leads to maybe tens of picoseconds of estimate error.

This is a common error that I've seen repeated in this thread.  The
only reason that it has historically been important is because when
you are doing timestamping in software based on an interrupt, that
stuff does matter.

Warner

^ permalink raw reply

* Re: [PATCH v6 0/8] ptp: IEEE 1588 hardware clock support
From: Christoph Lameter @ 2010-09-27 15:56 UTC (permalink / raw)
  To: Alan Cox
  Cc: John Stultz, Rodolfo Giometti, Arnd Bergmann, Peter Zijlstra,
	linux-api, devicetree-discuss, linux-kernel, David Miller, netdev,
	Thomas Gleixner, linuxppc-dev, Richard Cochran, linux-arm-kernel,
	Krzysztof Halasa
In-Reply-To: <20100924150246.0e6064b6@lxorguk.ukuu.org.uk>


On Fri, 24 Sep 2010, Alan Cox wrote:

> Whether you add new syscalls or do the fd passing using flags and hide
> the ugly bits in glibc is another question.

Use device specific ioctls instead of syscalls?

^ permalink raw reply

* Re: [PATCH v6 0/8] ptp: IEEE 1588 hardware clock support
From: Christoph Lameter @ 2010-09-27 15:52 UTC (permalink / raw)
  To: john stultz
  Cc: Rodolfo Giometti, Arnd Bergmann, Peter Zijlstra, linux-api,
	devicetree-discuss, linux-kernel, David Miller, netdev,
	Thomas Gleixner, linuxppc-dev, Richard Cochran, linux-arm-kernel,
	Krzysztof Halasa
In-Reply-To: <1285278136.2587.154.camel@localhost.localdomain>

On Thu, 23 Sep 2010, john stultz wrote:

> > > 3) Further, the PTP hardware counter can be simply set to a new offset
> > > to put it in line with the network time. This could cause trouble with
> > > timekeeping much like unsynced TSCs do.
> >
> > You can do the same for system time.
>
> Settimeofday does allow CLOCK_REALTIME to jump, but the CLOCK_MONOTONIC
> time cannot jump around. Having a clocksource that is non-monotonic
> would break this.

Currently time runs at the same speed. CLOCK_MONOTONIC runs at a offset
to CLOCK_REALTIME. We are creating APIs here that allow time to run at
different speeds.

> The design actually avoids most userland induced latency.
>
> 1) On the PTP hardware syncing point, the reference packet gets
> timestamped with the PTP hardware time on arrival. This allows the
> offset calculation to be done in userland without introducing latency.

The timestamps allows the calculation of the network transmission time I
guess and therefore its more accurate to calculate that effect out. Ok but
then the overhead of getting to code in user space (that does the proper
clock adjustments) is resulting in the addition of a relatively long time
that is subject to OS scheduling latencies and noises.

> 2) On the system syncing side, the proposal for the PPS interrupt allows
> the PTP hardware to trigger an interrupt on the second boundary that
> would take a timestamp of the system time. Then the pps interface allows
> for the timestamp to be read from userland allowing the offset to be
> calculated without introducing additional latency.

Sorry dont really get the whole picture here it seems. Sounds like one is
going through additional unnecessary layers. Why would the PTP hardware
triggger an interrupt? I thought the PTP messages came in via
timestamping and are then processed by software. Then the software is
issuing a hardware interrupt that then triggers the PPS subsystem. And
that is supposed to be better than directly interfacing with the PTP?


> Additionally, even just in userland, it would be easy to bracket two
> reads of the system time around one read of the PTP clock to bound any
> userland latency fairly well. It may not be as good as the PPS interface
> (although that depends on the interrupt latency), but if the accesses
> are all local, it probably could get fairly close.

That sounds hacky.

> > Ok maybe we need some sort of control interface to manage the clock like
> > the others have.
>
> That's what the clock_adjtime call provides.

Ummm... You are managing a hardware device with hardware (driver) specific
settings. That is currently being done via ioctls. Why generalize it?

> > The posix clocks today assumes one notion of real "time" in the kernel.
> > All clocks increase in lockstep (aside from offset updates).
>
> Not true. The cputime clockids do not increment at the same rate (as the
> apps don't always run). Further CLOCK_MONOTONIC_RAW provides a non-freq
> corrected view of CLOCK_MONOTONIC, so it increments at a slightly
> different rate.

cputime clockids are not tracking time but cpu resource use.

> Re-using the fairly nice (Alan of course disagrees :) posix interface
> seems at least a little better for application developers who actually
> have to use the hardware.

Well it may also be confusing for others. The application developers also
will have a hard time using a generic clock interface to control PTP
device specific things like frequencies, rates etc etc. So you always need
to ioctl/device specific control interface regardless.

^ permalink raw reply

* Re: [PATCH v6 0/8] ptp: IEEE 1588 hardware clock support
From: Christoph Lameter @ 2010-09-27 15:37 UTC (permalink / raw)
  To: Christian Riesch
  Cc: Rodolfo Giometti, Arnd Bergmann, Peter Zijlstra, john stultz,
	devicetree-discuss, linux-kernel, David Miller, netdev,
	linux-arm-kernel, linux-api, Thomas Gleixner, linuxppc-dev,
	Richard Cochran, Alan Cox, Krzysztof Halasa
In-Reply-To: <4C9BC7CE.8020400@riesch.at>

On Thu, 23 Sep 2010, Christian Riesch wrote:

> > > It implies clock tuning in userspace for a potential sub microsecond
> > > accurate clock. The clock accuracy will be limited by user space
> > > latencies and noise. You wont be able to discipline the system clock
> > > accurately.
> >
> > Noise matters, latency doesn't.
>
> Well put! That's why we need hardware support for PTP timestamping to reduce
> the noise, but get along well with the clock servo that is steering the PHC in
> user space.

Even if I buy into the catch phrase above: User space is subject to noise
that the in kernel code is not. If you do the tuning over long intervals
then it hopefully averages out but it still causes jitter effects that
affects the degree of accuracy (or sync) that you can reach. And the noise
varies with the load on the system.

^ permalink raw reply

* Re: [PATCH 1/2] 476: Set CCR2[DSTI] to prevent isync from flushing shadow TLB
From: Dave Kleikamp @ 2010-09-27 15:26 UTC (permalink / raw)
  To: Josh Boyer; +Cc: linuxppc-dev list
In-Reply-To: <20100927150434.GA2888@zod.rchland.ibm.com>

On Mon, 2010-09-27 at 11:04 -0400, Josh Boyer wrote:
> On Fri, Sep 24, 2010 at 01:01:36PM -0500, Dave Kleikamp wrote:
> >When the DSTI (Disable Shadow TLB Invalidate) bit is set in the CCR2
> >register, the isync command does not flush the shadow TLB (iTLB & dTLB).
> >
> >However, since the shadow TLB does not contain context information, we
> >want the shadow TLB flushed in situations where we are switching context.
> >In those situations, we explicitly clear the DSTI bit before performing
> >isync, and set it again afterward.  We also need to do the same when we
> >perform isync after explicitly flushing the TLB.
> >
> >Th setting of the DSTI bit is dependent on
> >CONFIG_PPC_47x_DISABLE_SHADOW_TLB_INVALIDATE.  When we are confident that
> >the feature works as expected, the option can probably be removed.
> 
> You're defaulting it to 'y' in the Kconfig.  Technically someone could
> turn it off I guess, but practice mostly shows that nobody mucks with
> the defaults.  Do you want it to default 'n' for now if you aren't
> confident in it just quite yet?

I think I made it a config option at Ben's request when I first started
this work last year, before being sidetracked by other priorities.  I
could either remove the option, or default it to 'n'.  It might be best
to just hard-code the behavior to make sure it's exercised, since
there's no 47x hardware in production yet, but we can give Ben a chance
to weigh in with his opinion.

> (Linus also has some kind of gripe with new options being default 'y',
> but I don't recall all the details and I doubt he'd care about something
> in low-level PPC code.)
> 
> josh

-- 
Dave Kleikamp
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH RFCv2 1/2] dmaengine: add support for scatterlist to scatterlist transfers
From: Linus Walleij @ 2010-09-27 15:23 UTC (permalink / raw)
  To: Ira W. Snyder; +Cc: Per Fridén, Dan Williams, linuxppc-dev, linux-kernel
In-Reply-To: <1285370032-16937-2-git-send-email-iws@ovro.caltech.edu>

2010/9/25 Ira W. Snyder <iws@ovro.caltech.edu>:

> This adds support for scatterlist to scatterlist DMA transfers.

This is a good idea, we have a local function to do this in DMA40 already,
stedma40_memcpy_sg().

> This is
> currently hidden behind a configuration option, which will allow drivers
> which need this functionality to select it individually.

Why? Isn't it better to add this as a new capability flag
if you don't want to announce it? Or is the intent to save
memory footprint?

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH 1/2] 476: Set CCR2[DSTI] to prevent isync from flushing shadow TLB
From: Josh Boyer @ 2010-09-27 15:04 UTC (permalink / raw)
  To: Dave Kleikamp; +Cc: linuxppc-dev list
In-Reply-To: <1285351297-9999-2-git-send-email-shaggy@linux.vnet.ibm.com>

On Fri, Sep 24, 2010 at 01:01:36PM -0500, Dave Kleikamp wrote:
>When the DSTI (Disable Shadow TLB Invalidate) bit is set in the CCR2
>register, the isync command does not flush the shadow TLB (iTLB & dTLB).
>
>However, since the shadow TLB does not contain context information, we
>want the shadow TLB flushed in situations where we are switching context.
>In those situations, we explicitly clear the DSTI bit before performing
>isync, and set it again afterward.  We also need to do the same when we
>perform isync after explicitly flushing the TLB.
>
>Th setting of the DSTI bit is dependent on
>CONFIG_PPC_47x_DISABLE_SHADOW_TLB_INVALIDATE.  When we are confident that
>the feature works as expected, the option can probably be removed.

You're defaulting it to 'y' in the Kconfig.  Technically someone could
turn it off I guess, but practice mostly shows that nobody mucks with
the defaults.  Do you want it to default 'n' for now if you aren't
confident in it just quite yet?

(Linus also has some kind of gripe with new options being default 'y',
but I don't recall all the details and I doubt he'd care about something
in low-level PPC code.)

josh

^ permalink raw reply

* Re: Oops in trace_hardirqs_on (powerpc)
From: Jörg Sommer @ 2010-09-27 12:50 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Ingo Molnar, linux-kernel, linuxppc-dev
In-Reply-To: <1285184671.26872.127.camel@gandalf.stny.rr.com>

[-- Attachment #1: Type: text/plain, Size: 720 bytes --]

Hello Steven,

Steven Rostedt hat am Wed 22. Sep, 15:44 (-0400) geschrieben:
> Sorry for the late reply, but I was on vacation when you sent this, and
> I missed it while going through email.
> 
> Do you still have this issue?

No. I've rebuild my kernel without TRACE_IRQFLAGS and the problem
vanished, as expected. The problem is, that in some cases the stack is
only two frames deep, which causes the macro CALLER_ADDR1 makes an
invalid access. Someone told me, there a workaround for the problem on
i386, too.

% sed -n 2p arch/x86/lib/thunk_32.S
 * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)

Bye, Jörg.
-- 
Angenehme Worte sind nie wahr,
wahre Worte sind nie angenehm.

[-- Attachment #2: Digital signature http://en.wikipedia.org/wiki/OpenPGP --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: MPC8641D PEX: programming OWBAR in Endpoint mode?
From: tiejun.chen @ 2010-09-26 10:14 UTC (permalink / raw)
  To: David Hagood; +Cc: linuxppc-dev
In-Reply-To: <1285415473.2454.15.camel@chumley>

David Hagood wrote:
> On Sat, 2010-09-25 at 17:46 +0800, tiejun.chen wrote:
>> As a summary you have no any issue to access InBound/BAR/LAW.
> 
> Correct: full access to the inbound.
>> And I remember there is only one
>> requirement to OutBound, and that is the window address should be aligned based
>> on the size from OWS. Are you sure?
> Yes. As my first test I've been using 4k and 64k sizes, aligned on 16M
> boundaries.
>> And did you try to configure OutBound REGS when RC mode? If so I'm afraid we may
>> miss some errata on EP.
> In RC mode the built-in drivers take over.

Can you confirm the build-in driver can write/read OutBound correctly on your
target?

-Tiejun

> 
> 
> 

^ permalink raw reply

* Re: [PATCH 1/1]  Add config option for batched hcalls
From: Olof Johansson @ 2010-09-26  3:49 UTC (permalink / raw)
  To: Will Schmidt; +Cc: linuxppc-dev, Anton Blanchard
In-Reply-To: <1285364655.3016.25.camel@lexx>

On Fri, Sep 24, 2010 at 04:44:15PM -0500, Will Schmidt wrote:
> 
> Add a config option for the (batched) MULTITCE and BULK_REMOVE h-calls.
> 
> By default, these options are on and are beneficial for performance and
> throughput reasons.   If disabled, the code will fall back to using less
> optimal TCE and REMOVE hcalls.   The ability to easily disable these
> options is useful for some of the PREEMPT_RT related investigation and
> work occurring on Power.

Hi,

I can see why it's useful to enable and disable, but these are all
runtime-checked, wouldn't it be more useful to add a bootarg to handle
it instead of adding some new config options that pretty much everyone
will always go with the defaults on?

The bits are set early, but from looking at where they're used, there
doesn't seem to be any harm in disabling them later on when a bootarg
is convenient to parse and deal with?

It has the benefit of easier on/off testing, if that has any value for
production debug down the road.


-Olof

^ permalink raw reply

* Re: [PATCH] powerpc/5121: pdm360ng: fix touch irq if 8xxx gpio driver is enabled
From: Anatolij Gustschin @ 2010-09-25 20:22 UTC (permalink / raw)
  To: Grant Likely; +Cc: linuxppc-dev
In-Reply-To: <20100916023823.GB7049@angua.secretlab.ca>

On Wed, 15 Sep 2010 20:38:23 -0600
Grant Likely <grant.likely@secretlab.ca> wrote:

> On Wed, Sep 15, 2010 at 10:12:57PM +0200, Anatolij Gustschin wrote:
> > Enabling the MPC8xxx GPIO driver with MPC512x GPIO extension
> > breaks touch screen support on this board since the GPIO
> > interrupt will be mapped to 8xxx GPIO irq host resulting in
> > a not requestable interrupt in the touch screen driver. Fix
> > it by mapping the touch interrupt on 8xxx GPIO irq host.
> 
> This looks wrong to me.  The touchscreen code should not go mucking
> about in the GPIO controller registers; that is the job of the gpio
> driver.

But if there is no GPIO driver (as it was the case before adding
mpc512x support in the 8xxx gpio driver) or if the driver is not
enabled in the kernel configuration? Then the platform specific
callback (called from touchscreen driver) returns the pin state
and acknowlegdes the interrupt.

>  What is the reason that the touch interrupt isn't
> requestable?

The 8xxx gpio driver sets up gpio irq host and installs
the chained irq handler for GPIO interrupt 78 using
set_irq_chained_handler() which sets the status field of
the irq_desc structure to IRQ_NOREQUEST | IRQ_NOPROBE.
Other drivers can't request this GPIO interrupt any more,
request_threaded_irq() checks the IRQ_NOREQUEST status
flag and returns -EINVAL if it is set. The gpio interrupts
for each gpio pin are now handled by the
mpc8xxx_gpio_irq_cascade() handler as they should.

>  It looks like the 8xxx gpio driver is designed to hand
> out a separate virq number for each gpio pin (I've not had time to dig
> into details, so you'll need to educate me on the problem details)

Yes, exactly. This patch adds code to request the
board's pen_down gpio pin and to use it's virq number in
the touchscreen driver. The touchscreen driver can
request this virq interrupt and it is now properly handled
by the chained handler in the gpio driver.

Anatolij

^ permalink raw reply

* Re: MPC8641D PEX: programming OWBAR in Endpoint mode?
From: David Hagood @ 2010-09-25 11:51 UTC (permalink / raw)
  To: tiejun.chen; +Cc: linuxppc-dev
In-Reply-To: <4C9DC510.8010403@windriver.com>

On Sat, 2010-09-25 at 17:46 +0800, tiejun.chen wrote:
> As a summary you have no any issue to access InBound/BAR/LAW.

Correct: full access to the inbound.
> And I remember there is only one
> requirement to OutBound, and that is the window address should be aligned based
> on the size from OWS. Are you sure?
Yes. As my first test I've been using 4k and 64k sizes, aligned on 16M
boundaries.
> 
> And did you try to configure OutBound REGS when RC mode? If so I'm afraid we may
> miss some errata on EP.
In RC mode the built-in drivers take over.

^ permalink raw reply

* Re: MPC8641D PEX: programming OWBAR in Endpoint mode?
From: tiejun.chen @ 2010-09-25  9:46 UTC (permalink / raw)
  To: David Hagood; +Cc: linuxppc-dev
In-Reply-To: <1285325425.2454.11.camel@chumley>

David Hagood wrote:
> On Fri, 2010-09-24 at 07:09 +0200, Chen, Tiejun wrote:
> 
>> Right but this should be done for RC mode, not for EP mode we're
>> discussing.
>>
>> Tiejun
> 
> According to the Freescale documentation, outbound is just as valid for
> endpoint as for root complex - indeed, to generate MSIs from software

Yes.

As a summary you have no any issue to access InBound/BAR/LAW. And actually I'm
always strange that even you cannot configure InBound/BAR/LAW properly, you also
should be allowed to write OutBound REGs. And I remember there is only one
requirement to OutBound, and that is the window address should be aligned based
on the size from OWS. Are you sure?

And did you try to configure OutBound REGS when RC mode? If so I'm afraid we may
miss some errata on EP.

And do you have any response from Freescale?

Cheers
Tiejun

> REQUIRES programming an outbound ATMU to access the host's APIC.
> 
> Moreover, ANY PCI endpoint SHOULD be able to do bus master access, and
> that is done by the outbound ATMUs.
> 
> 
> 

^ permalink raw reply

* [PATCH RFCv2 2/2] fsldma: use generic support for scatterlist to scatterlist transfers
From: Ira W. Snyder @ 2010-09-24 23:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dan Williams, linuxppc-dev
In-Reply-To: <1285370032-16937-1-git-send-email-iws@ovro.caltech.edu>

The fsldma driver uses the DMA_SLAVE API to handle scatterlist to
scatterlist DMA transfers. For quite a while now, it has been possible
to mimic the operation by using the device_prep_dma_memcpy() routine
intelligently.

Now that the DMAEngine API has grown generic support for scatterlist to
scatterlist transfers, this operation is no longer needed. The generic
support is used for scatterlist to scatterlist transfers.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
---
 arch/powerpc/include/asm/fsldma.h |  115 ++------------------
 drivers/dma/fsldma.c              |  219 +++++++------------------------------
 2 files changed, 48 insertions(+), 286 deletions(-)

diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h
index debc5ed..dc0bd27 100644
--- a/arch/powerpc/include/asm/fsldma.h
+++ b/arch/powerpc/include/asm/fsldma.h
@@ -1,7 +1,7 @@
 /*
  * Freescale MPC83XX / MPC85XX DMA Controller
  *
- * Copyright (c) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
+ * Copyright (c) 2009-2010 Ira W. Snyder <iws@ovro.caltech.edu>
  *
  * This file is licensed under the terms of the GNU General Public License
  * version 2. This program is licensed "as is" without any warranty of any
@@ -11,127 +11,32 @@
 #ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
 #define __ARCH_POWERPC_ASM_FSLDMA_H__
 
-#include <linux/slab.h>
 #include <linux/dmaengine.h>
 
 /*
- * Definitions for the Freescale DMA controller's DMA_SLAVE implemention
+ * The Freescale DMA controller has several features that are not accomodated
+ * in the Linux DMAEngine API. Therefore, the generic structure is expanded
+ * to allow drivers to use these features.
  *
- * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
- * transfers. An example usage would be an accelerated copy between two
- * scatterlists. Another example use would be an accelerated copy from
- * multiple non-contiguous device buffers into a single scatterlist.
+ * This structure should be passed into the DMAEngine routine device_control()
+ * as in this example:
  *
- * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
- * structure contains a list of hardware addresses that should be copied
- * to/from the scatterlist passed into device_prep_slave_sg(). The structure
- * also has some fields to enable hardware-specific features.
+ * chan->device->device_control(chan, DMA_SLAVE_CONFIG, (unsigned long)cfg);
  */
 
 /**
- * struct fsl_dma_hw_addr
- * @entry: linked list entry
- * @address: the hardware address
- * @length: length to transfer
- *
- * Holds a single physical hardware address / length pair for use
- * with the DMAEngine DMA_SLAVE API.
- */
-struct fsl_dma_hw_addr {
-	struct list_head entry;
-
-	dma_addr_t address;
-	size_t length;
-};
-
-/**
  * struct fsl_dma_slave
- * @addresses: a linked list of struct fsl_dma_hw_addr structures
+ * @config: the standard Linux DMAEngine API DMA_SLAVE configuration
  * @request_count: value for DMA request count
- * @src_loop_size: setup and enable constant source-address DMA transfers
- * @dst_loop_size: setup and enable constant destination address DMA transfers
  * @external_start: enable externally started DMA transfers
  * @external_pause: enable externally paused DMA transfers
- *
- * Holds a list of address / length pairs for use with the DMAEngine
- * DMA_SLAVE API implementation for the Freescale DMA controller.
  */
-struct fsl_dma_slave {
+struct fsldma_slave_config {
+	struct dma_slave_config config;
 
-	/* List of hardware address/length pairs */
-	struct list_head addresses;
-
-	/* Support for extra controller features */
 	unsigned int request_count;
-	unsigned int src_loop_size;
-	unsigned int dst_loop_size;
 	bool external_start;
 	bool external_pause;
 };
 
-/**
- * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
- * @slave: the &struct fsl_dma_slave to add to
- * @address: the hardware address to add
- * @length: the length of bytes to transfer from @address
- *
- * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
- * success, -ERRNO otherwise.
- */
-static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave,
-				       dma_addr_t address, size_t length)
-{
-	struct fsl_dma_hw_addr *addr;
-
-	addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
-	if (!addr)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&addr->entry);
-	addr->address = address;
-	addr->length = length;
-
-	list_add_tail(&addr->entry, &slave->addresses);
-	return 0;
-}
-
-/**
- * fsl_dma_slave_free - free a struct fsl_dma_slave
- * @slave: the struct fsl_dma_slave to free
- *
- * Free a struct fsl_dma_slave and all associated address/length pairs
- */
-static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave)
-{
-	struct fsl_dma_hw_addr *addr, *tmp;
-
-	if (slave) {
-		list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) {
-			list_del(&addr->entry);
-			kfree(addr);
-		}
-
-		kfree(slave);
-	}
-}
-
-/**
- * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave
- * @gfp: the flags to pass to kmalloc when allocating this structure
- *
- * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new
- * struct fsl_dma_slave on success, or NULL on failure.
- */
-static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp)
-{
-	struct fsl_dma_slave *slave;
-
-	slave = kzalloc(sizeof(*slave), gfp);
-	if (!slave)
-		return NULL;
-
-	INIT_LIST_HEAD(&slave->addresses);
-	return slave;
-}
-
 #endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index cea08be..c90b393 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -599,207 +599,64 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
 	struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len,
 	enum dma_data_direction direction, unsigned long flags)
 {
-	struct fsldma_chan *chan;
-	struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL;
-	struct fsl_dma_slave *slave;
-	size_t copy;
-
-	int i;
-	struct scatterlist *sg;
-	size_t sg_used;
-	size_t hw_used;
-	struct fsl_dma_hw_addr *hw;
-	dma_addr_t dma_dst, dma_src;
-
-	if (!dchan)
-		return NULL;
-
-	if (!dchan->private)
-		return NULL;
-
-	chan = to_fsl_chan(dchan);
-	slave = dchan->private;
-
-	if (list_empty(&slave->addresses))
-		return NULL;
-
-	hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry);
-	hw_used = 0;
-
 	/*
-	 * Build the hardware transaction to copy from the scatterlist to
-	 * the hardware, or from the hardware to the scatterlist
+	 * This operation is not supported on the Freescale DMA controller
 	 *
-	 * If you are copying from the hardware to the scatterlist and it
-	 * takes two hardware entries to fill an entire page, then both
-	 * hardware entries will be coalesced into the same page
-	 *
-	 * If you are copying from the scatterlist to the hardware and a
-	 * single page can fill two hardware entries, then the data will
-	 * be read out of the page into the first hardware entry, and so on
+	 * However, we need to provide the function pointer to allow the
+	 * device_control() method to work.
 	 */
-	for_each_sg(sgl, sg, sg_len, i) {
-		sg_used = 0;
-
-		/* Loop until the entire scatterlist entry is used */
-		while (sg_used < sg_dma_len(sg)) {
-
-			/*
-			 * If we've used up the current hardware address/length
-			 * pair, we need to load a new one
-			 *
-			 * This is done in a while loop so that descriptors with
-			 * length == 0 will be skipped
-			 */
-			while (hw_used >= hw->length) {
-
-				/*
-				 * If the current hardware entry is the last
-				 * entry in the list, we're finished
-				 */
-				if (list_is_last(&hw->entry, &slave->addresses))
-					goto finished;
-
-				/* Get the next hardware address/length pair */
-				hw = list_entry(hw->entry.next,
-						struct fsl_dma_hw_addr, entry);
-				hw_used = 0;
-			}
-
-			/* Allocate the link descriptor from DMA pool */
-			new = fsl_dma_alloc_descriptor(chan);
-			if (!new) {
-				dev_err(chan->dev, "No free memory for "
-						       "link descriptor\n");
-				goto fail;
-			}
-#ifdef FSL_DMA_LD_DEBUG
-			dev_dbg(chan->dev, "new link desc alloc %p\n", new);
-#endif
-
-			/*
-			 * Calculate the maximum number of bytes to transfer,
-			 * making sure it is less than the DMA controller limit
-			 */
-			copy = min_t(size_t, sg_dma_len(sg) - sg_used,
-					     hw->length - hw_used);
-			copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT);
-
-			/*
-			 * DMA_FROM_DEVICE
-			 * from the hardware to the scatterlist
-			 *
-			 * DMA_TO_DEVICE
-			 * from the scatterlist to the hardware
-			 */
-			if (direction == DMA_FROM_DEVICE) {
-				dma_src = hw->address + hw_used;
-				dma_dst = sg_dma_address(sg) + sg_used;
-			} else {
-				dma_src = sg_dma_address(sg) + sg_used;
-				dma_dst = hw->address + hw_used;
-			}
-
-			/* Fill in the descriptor */
-			set_desc_cnt(chan, &new->hw, copy);
-			set_desc_src(chan, &new->hw, dma_src);
-			set_desc_dst(chan, &new->hw, dma_dst);
-
-			/*
-			 * If this is not the first descriptor, chain the
-			 * current descriptor after the previous descriptor
-			 */
-			if (!first) {
-				first = new;
-			} else {
-				set_desc_next(chan, &prev->hw,
-					      new->async_tx.phys);
-			}
-
-			new->async_tx.cookie = 0;
-			async_tx_ack(&new->async_tx);
-
-			prev = new;
-			sg_used += copy;
-			hw_used += copy;
-
-			/* Insert the link descriptor into the LD ring */
-			list_add_tail(&new->node, &first->tx_list);
-		}
-	}
-
-finished:
-
-	/* All of the hardware address/length pairs had length == 0 */
-	if (!first || !new)
-		return NULL;
-
-	new->async_tx.flags = flags;
-	new->async_tx.cookie = -EBUSY;
-
-	/* Set End-of-link to the last link descriptor of new list */
-	set_ld_eol(chan, new);
-
-	/* Enable extra controller features */
-	if (chan->set_src_loop_size)
-		chan->set_src_loop_size(chan, slave->src_loop_size);
-
-	if (chan->set_dst_loop_size)
-		chan->set_dst_loop_size(chan, slave->dst_loop_size);
-
-	if (chan->toggle_ext_start)
-		chan->toggle_ext_start(chan, slave->external_start);
-
-	if (chan->toggle_ext_pause)
-		chan->toggle_ext_pause(chan, slave->external_pause);
-
-	if (chan->set_request_count)
-		chan->set_request_count(chan, slave->request_count);
-
-	return &first->async_tx;
-
-fail:
-	/* If first was not set, then we failed to allocate the very first
-	 * descriptor, and we're done */
-	if (!first)
-		return NULL;
-
-	/*
-	 * First is set, so all of the descriptors we allocated have been added
-	 * to first->tx_list, INCLUDING "first" itself. Therefore we
-	 * must traverse the list backwards freeing each descriptor in turn
-	 *
-	 * We're re-using variables for the loop, oh well
-	 */
-	fsldma_free_desc_list_reverse(chan, &first->tx_list);
 	return NULL;
 }
 
 static int fsl_dma_device_control(struct dma_chan *dchan,
 				  enum dma_ctrl_cmd cmd, unsigned long arg)
 {
+	struct fsldma_slave_config *cfg;
 	struct fsldma_chan *chan;
 	unsigned long flags;
 
-	/* Only supports DMA_TERMINATE_ALL */
-	if (cmd != DMA_TERMINATE_ALL)
-		return -ENXIO;
-
 	if (!dchan)
 		return -EINVAL;
 
 	chan = to_fsl_chan(dchan);
 
-	/* Halt the DMA engine */
-	dma_halt(chan);
+	switch (cmd) {
+	case DMA_TERMINATE_ALL:
+		/* Halt the DMA engine */
+		dma_halt(chan);
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+		spin_lock_irqsave(&chan->desc_lock, flags);
 
-	/* Remove and free all of the descriptors in the LD queue */
-	fsldma_free_desc_list(chan, &chan->ld_pending);
-	fsldma_free_desc_list(chan, &chan->ld_running);
+		/* Remove and free all of the descriptors in the LD queue */
+		fsldma_free_desc_list(chan, &chan->ld_pending);
+		fsldma_free_desc_list(chan, &chan->ld_running);
 
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+		spin_unlock_irqrestore(&chan->desc_lock, flags);
+		return 0;
+
+	case DMA_SLAVE_CONFIG:
+
+		cfg = (struct fsldma_slave_config *)arg;
+		if (chan->set_request_count)
+			chan->set_request_count(chan, cfg->request_count);
+
+		if (chan->toggle_ext_start)
+			chan->toggle_ext_start(chan, cfg->external_start);
+
+		if (chan->toggle_ext_pause)
+			chan->toggle_ext_pause(chan, cfg->external_pause);
+
+		/*
+		 * TODO: add other features
+		 *
+		 * I'm not sure how to use the members dma_slave_config to
+		 * control the src/dst address hold features.
+		 */
+		return 0;
+
+	default:
+		return -ENXIO;
+	}
 
 	return 0;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH RFCv2 1/2] dmaengine: add support for scatterlist to scatterlist transfers
From: Ira W. Snyder @ 2010-09-24 23:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dan Williams, linuxppc-dev
In-Reply-To: <1285370032-16937-1-git-send-email-iws@ovro.caltech.edu>

This adds support for scatterlist to scatterlist DMA transfers. This is
currently hidden behind a configuration option, which will allow drivers
which need this functionality to select it individually.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
---
 drivers/dma/Kconfig       |    3 +
 drivers/dma/dmaengine.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |    6 ++
 3 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 9520cf0..82d2244 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -89,6 +89,9 @@ config AT_HDMAC
 	  Support the Atmel AHB DMA controller.  This can be integrated in
 	  chips such as the Atmel AT91SAM9RL.
 
+config DMAENGINE_SG_TO_SG
+	bool
+
 config FSL_DMA
 	tristate "Freescale Elo and Elo Plus DMA support"
 	depends on FSL_SOC
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 9d31d5e..9238b86 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -972,6 +972,131 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg,
 }
 EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
 
+#ifdef CONFIG_DMAENGINE_SG_TO_SG
+dma_cookie_t
+dma_async_memcpy_sg_to_sg(struct dma_chan *chan,
+			  struct scatterlist *dst_sg, unsigned int dst_nents,
+			  struct scatterlist *src_sg, unsigned int src_nents,
+			  dma_async_tx_callback cb, void *cb_param)
+{
+	struct dma_device *dev = chan->device;
+	struct dma_async_tx_descriptor *tx;
+	dma_cookie_t cookie = -ENOMEM;
+	size_t dst_avail, src_avail;
+	struct scatterlist *sg;
+	size_t transferred = 0;
+	size_t dst_total = 0;
+	size_t src_total = 0;
+	dma_addr_t dst, src;
+	size_t len;
+	int i;
+
+	if (dst_nents == 0 || src_nents == 0)
+		return -EINVAL;
+
+	if (dst_sg == NULL || src_sg == NULL)
+		return -EINVAL;
+
+	/* get the total count of bytes in each scatterlist */
+	for_each_sg(dst_sg, sg, dst_nents, i)
+		dst_total += sg_dma_len(sg);
+
+	for_each_sg(src_sg, sg, src_nents, i)
+		src_total += sg_dma_len(sg);
+
+	/* get prepared for the loop */
+	dst_avail = sg_dma_len(dst_sg);
+	src_avail = sg_dma_len(src_sg);
+
+	/* run until we are out of descriptors */
+	while (true) {
+
+		/* create the largest transaction possible */
+		len = min_t(size_t, src_avail, dst_avail);
+		if (len == 0)
+			goto fetch;
+
+		dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - dst_avail;
+		src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - src_avail;
+
+		/*
+		 * get a descriptor
+		 *
+		 * we must poll for a descriptor here since the DMAEngine API
+		 * does not provide a way for external users to free previously
+		 * allocated descriptors
+		 */
+		for (;;) {
+			tx = dev->device_prep_dma_memcpy(chan, dst, src, len, 0);
+			if (likely(tx))
+				break;
+
+			dma_async_issue_pending(chan);
+		}
+
+		/* update metadata */
+		transferred += len;
+		dst_avail -= len;
+		src_avail -= len;
+
+		/* if this is the last transfer, setup the callback */
+		if (dst_total == transferred || src_total == transferred) {
+			tx->callback = cb;
+			tx->callback_param = cb_param;
+		}
+
+		/* submit the transaction */
+		cookie = tx->tx_submit(tx);
+		if (dma_submit_error(cookie)) {
+			dev_err(dev->dev, "failed to submit desc\n");
+			return cookie;
+		}
+
+fetch:
+		/* fetch the next dst scatterlist entry */
+		if (dst_avail == 0) {
+
+			/* no more entries: we're done */
+			if (dst_nents == 0)
+				break;
+
+			/* fetch the next entry: if there are no more: done */
+			dst_sg = sg_next(dst_sg);
+			if (dst_sg == NULL)
+				break;
+
+			dst_nents--;
+			dst_avail = sg_dma_len(dst_sg);
+		}
+
+		/* fetch the next src scatterlist entry */
+		if (src_avail == 0) {
+
+			/* no more entries: we're done */
+			if (src_nents == 0)
+				break;
+
+			/* fetch the next entry: if there are no more: done */
+			src_sg = sg_next(src_sg);
+			if (src_sg == NULL)
+				break;
+
+			src_nents--;
+			src_avail = sg_dma_len(src_sg);
+		}
+	}
+
+	/* update counters */
+	preempt_disable();
+	__this_cpu_add(chan->local->bytes_transferred, transferred);
+	__this_cpu_inc(chan->local->memcpy_count);
+	preempt_enable();
+
+	return 0;
+}
+EXPORT_SYMBOL(dma_async_memcpy_sg_to_sg);
+#endif
+
 void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 	struct dma_chan *chan)
 {
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c61d4ca..28803a0 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -632,6 +632,12 @@ dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
 dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
 	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
 	unsigned int src_off, size_t len);
+#ifdef CONFIG_DMAENGINE_SG_TO_SG
+dma_cookie_t dma_async_memcpy_sg_to_sg(struct dma_chan *chan,
+	struct scatterlist *dst_sg, unsigned int dst_nents,
+	struct scatterlist *src_sg, unsigned int src_nents,
+	dma_async_tx_callback cb, void *cb_param);
+#endif
 void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 	struct dma_chan *chan);
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH RFCv2 0/2] dma: add support for sg-to-sg transfers
From: Ira W. Snyder @ 2010-09-24 23:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dan Williams, linuxppc-dev

This series adds support for scatterlist to scatterlist transfers to the
generic DMAEngine API. I have hidden it behind a configuration option to
allow specific drivers that need this functionality to enable it.

This series is intended to lay the groundwork for further changes to the
series titled "CARMA Board Support". That series will be updated when I
have time and hardware to test with.

This series has not been runtime tested yet. I am posting it only to
gain comments before I spend the effort to update the driver that
depends on this.

To help reviewers, I'd like to comment on the architecture of
dma_async_memcpy_sg_to_sg(). It explicitly avoids using descriptor
chaining due to the way that feature interacts with the fsldma
controller's external start feature. To use the external start feature
properly, the in-memory descriptor chain must not be fragmented into
multiple smaller chains. This is what is achieved by submitting all
descriptors without using chaining.

An alternative implementation would create a device_prep_sg_to_sg()
function, and use that to allocate all descriptors in one shot. That
implementation would be safer against allocation failures than this one.

I would recommend against committing this until I've tested it on real
hardware.

Ira W. Snyder (2):
  dmaengine: add support for scatterlist to scatterlist transfers
  fsldma: use generic support for scatterlist to scatterlist transfers

 arch/powerpc/include/asm/fsldma.h |  115 ++------------------
 drivers/dma/Kconfig               |    4 +
 drivers/dma/dmaengine.c           |  119 ++++++++++++++++++++
 drivers/dma/fsldma.c              |  219 +++++++------------------------------
 include/linux/dmaengine.h         |   10 ++
 5 files changed, 181 insertions(+), 286 deletions(-)

^ permalink raw reply

* Re: [PATCH RFCv1 1/2] dmaengine: add support for scatterlist to scatterlist transfers
From: Ira W. Snyder @ 2010-09-24 22:53 UTC (permalink / raw)
  To: Dan Williams, linuxppc-dev@lists.ozlabs.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20100924220419.GC24654@ovro.caltech.edu>

On Fri, Sep 24, 2010 at 03:04:19PM -0700, Ira W. Snyder wrote:
> On Fri, Sep 24, 2010 at 02:53:14PM -0700, Dan Williams wrote:
> > On Fri, 2010-09-24 at 14:24 -0700, Ira W. Snyder wrote:
> > > On Fri, Sep 24, 2010 at 01:40:56PM -0700, Dan Williams wrote:
> > > > I don't think any dma channels gracefully handle descriptors that were
> > > > prepped but not submitted.  You would probably need to submit the
> > > > backlog, poll for completion, and then return the error.
> > > > Alternatively, the expectation is that descriptor allocations are
> > > > transient, i.e. once previously submitted transactions are completed
> > > > the descriptors will return to the available pool.  So you could do
> > > > what async_tx routines do and just poll for a descriptor.
> > > >
> > > 
> > > Can you give me an example? Even some pseudocode would help.
> > 
> > Here is one from do_async_gen_syndrome() in crypto/async_tx/async_pq.c:
> > 
> >         /* Since we have clobbered the src_list we are committed
> >          * to doing this asynchronously.  Drivers force forward
> >          * progress in case they can not provide a descriptor
> >          */
> >         for (;;) {
> >                 tx = dma->device_prep_dma_pq(chan, dma_dest,
> >                                              &dma_src[src_off],
> >                                              pq_src_cnt,
> >                                              &coefs[src_off], len,
> >                                              dma_flags);
> >                 if (likely(tx))
> >                         break;  
> >                 async_tx_quiesce(&submit->depend_tx);
> >                 dma_async_issue_pending(chan);
> >         }       
> > 
> > > The other DMAEngine functions (dma_async_memcpy_*()) don't do anything
> > > with the descriptor if submit fails. Take for example
> > > dma_async_memcpy_buf_to_buf(). If tx->tx_submit(tx); fails, any code
> > > using it has no way to return the descriptor to the free pool.
> > > 
> > > Does tx_submit() implicitly return descriptors to the free pool if it
> > > fails?
> > 
> > No, submit() failures are a hold over from when the ioatdma driver used
> > to perform additional descriptor allocation at ->submit() time.  After
> > prep() the expectation is that the engine is just waiting to be told
> > "go" and can't fail.  The only reason ->submit() retains a return code
> > is to support the "cookie" based method for polling for operation
> > completion.  A dma driver should handle all descriptor submission
> > failure scenarios at prep time.
> > 
> 
> Ok, that's more like what I expected. So we still need the try forever
> code similar to the above. I can add that for the next version.
> 

When coding this change, I've noticed one problem that would break my
driver. I cannot issue dma_async_issue_pending() on the channel while
creating the descriptors, since this will start transferring the
previously submitted DMA descriptors. This breaks the external hardware
control requirement.

Imagine this scenario:
1) device is not yet setup for external control (nothing is pulsing the pins)
2) dma_async_memcpy_sg_to_sg()

- this hits an allocation failure, which calls dma_async_issue_pending()
- this causes the DMA engine to start transferring to a device which is
  not ready yet
- memory pressure stops, and allocation succeeds again
- some descriptors have been transferred, but not the ones since the
  alloc failure
- now the first half of the descriptors (pre alloc failure) have been
  transferred
- the second half of the descriptors (post alloc failure) are still
  pending
- the dma_async_memcpy_sg_to_sg() returns success: all tx_submit()
  succeeded

3) device_control() - setup external control mode
4) dma_async_issue_pending() - start the externally controlled transfer
5) tell the external agent to start controlling the DMA transaction

- now there isn't enough data left, and the external agent fails to
  program the FPGAs

I don't mind adding it to the code, since I have enough memory that I
don't ever see allocation failures. It is an embedded system, and we've
been careful not to overcommit memory. I think for all other users, it
would be the appropriate thing to do. Most people don't care if the
scatterlist is copied in two chunks with a time gap in the middle.

An alternative implementation would be to implement
device_prep_sg_to_sg() that returned a struct dma_async_tx_descriptor,
which could then be used as normal by higher layers. This would allow
the driver to allocate / cleanup all descriptors in one shot. This would
be completely robust to this error situation.

Is there one solution you'd prefer over the other? They're both similar
in the amount of code, though duplication would probably be increased in
the device_prep_sg_to_sg() case. If any other driver implements it.

Thanks,
Ira

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox