git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [EXPERIMENTAL PATCH] O(n) bisection algorithm
@ 2005-06-30  8:11 Jon Seymour
  2005-06-30  9:03 ` Jon Seymour
  0 siblings, 1 reply; 2+ messages in thread
From: Jon Seymour @ 2005-06-30  8:11 UTC (permalink / raw)
  To: git; +Cc: torvalds, jon.seymour


This is a rollup of an experimental patch to Linus' HEAD that adds
a O(n) bisection algorithm to git.

For the purposes of comparison, this version:
   * enables debugging (-DDEBUG)
   * adds a --debug switch to git-rev-list
   * adds a --bisect-by-cut switch to git-rev-list

This allows the same git-rev-list binary to be used to compare the existing
O(n^2) behaviour (--bisect) with the O(n) behaviour (--bisect-by-cut).

A measure of complexity of each algorithm can be obtained on stderr,
by using the --debug switch.

My measurements suggest that on a 2000 commit kernel history, this
algorithm is ~ 5 times faster than the O(n^2) algorithm. If this
is true, it should be ~25 times faster on a 10,000 commit kernel history.
This is the difference between a 16 second bisection and a 0.6 second
bisection on that size repository.

This patch also includes my latest topological sort implementation
so there is no need to apply that separately.

Linus: if you are happy to adopt this as the default bisection
algorithm, I'll rework it as a formal patch submission, meaning that I'll:
   * remove the Makefile change
   * remove --debug switch
   * remove the #ifdef'd DEBUG code
   * replace --bisect implementation with --bisect-by-cut implementation
     and remove --bisect-by-cut switch

Cleanly applies to Linus HEAD e4b5c7fff47d3a794249ea2da6b9ec4e33e157f3.

Signed-off-by: Jon Seymour <jon.seymour@gmail.com>
---
Linus: this isn't fit for inclusion in the mainline yet. I have
posted it to the list for review purposes only.
---

 Makefile   |    2 
 commit.c   |  117 ++++++++++++++++++++++++++++
 commit.h   |    5 +
 rev-list.c |  248 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 361 insertions(+), 11 deletions(-)

83df29ec101718325fbb0c6c8e5dcb891a224edf
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@
 # break unless your underlying filesystem supports those sub-second times
 # (my ext3 doesn't).
 COPTS=-O2
-CFLAGS=-g $(COPTS) -Wall
+CFLAGS=-DDEBUG -g $(COPTS) -Wall
 
 prefix=$(HOME)
 bin=$(prefix)/bin
diff --git a/commit.c b/commit.c
--- a/commit.c
+++ b/commit.c
@@ -3,6 +3,27 @@
 #include "commit.h"
 #include "cache.h"
 
+struct sort_node
+{
+	/*
+         * the number of children of the associated commit
+         * that also occur in the list being sorted.
+         */
+	unsigned int indegree;
+
+	/*
+         * reference to original list item that we will re-use
+         * on output.
+         */
+	struct commit_list * list_item;
+
+	/*
+         * copy of original object.util pointer that is saved
+         * during the topological sort.
+         */
+	void * save_util;
+};
+
 const char *commit_type = "commit";
 
 enum cmit_fmt get_commit_format(const char *arg)
@@ -346,3 +367,99 @@ int count_parents(struct commit * commit
         return count;
 }
 
+/*
+ * Performs an in-place topological sort on the list supplied
+ * 
+ * Invariant of resulting list is:
+ *        a reachable from b => ord(b) < ord(a)
+ */
+void sort_in_topological_order(struct commit_list ** list)
+{
+	struct commit_list * next = *list;
+	struct commit_list * work = NULL;
+	struct commit_list ** pptr = list;
+	struct sort_node * nodes;
+	struct sort_node * next_nodes;
+	int count = 0;
+
+	/* determine the size of the list */
+	while (next) {
+		next = next->next;
+		count++;
+	}
+	/* allocate an array to help sort the list */
+	nodes = xcalloc(count, sizeof(*nodes));
+	/* link the list to the array */
+	next_nodes = nodes;
+	next=*list;
+	while (next) {
+		next_nodes->list_item = next;
+		next_nodes->save_util = next->item->object.util;
+		next->item->object.util = next_nodes;
+		next_nodes++;
+		next = next->next;
+	}
+	/* update the indegree */
+	next=*list;
+	while (next) {
+		struct commit_list * parents = next->item->parents;
+		while (parents) {
+			struct commit * parent=parents->item;
+			struct sort_node * pn = (struct sort_node *)parent->object.util;
+			
+			if (pn)
+				pn->indegree++;
+			parents=parents->next;
+		}
+		next=next->next;
+	}
+	/* 
+         * find the tips
+         *
+         * tips are nodes not reachable from any other node in the list 
+         * 
+         * the tips serve as a starting set for the work queue.
+         */
+	next=*list;
+	while (next) {
+		struct sort_node * node = (struct sort_node *)next->item->object.util;
+
+		if (node->indegree == 0) {
+			commit_list_insert(next->item, &work);
+		}
+		next=next->next;
+	}
+	/* process the list in topological order */
+	while (work) {
+		struct commit * work_item = pop_commit(&work);
+		struct sort_node * work_node = (struct sort_node *)work_item->object.util;
+		struct commit_list * parents = work_item->parents;
+
+		while (parents) {
+			struct commit * parent=parents->item;
+			struct sort_node * pn = (struct sort_node *)parent->object.util;
+			
+			if (pn) {
+				/* 
+				 * parents are only enqueed for emission 
+                                 * when all their children have been emitted thereby
+                                 * guaranteeing topological order.
+                                 */
+				pn->indegree--;
+				if (!pn->indegree) 
+					commit_list_insert(parent, &work);
+			}
+			parents=parents->next;
+		}
+		/*
+                 * work_item is a commit all of whose children
+                 * have already been emitted. we can emit it now
+                 * and restore its util pointer.
+                 */
+		*pptr = work_node->list_item;
+		pptr = &(*pptr)->next;
+		*pptr = NULL;
+		work_item->object.util = work_node->save_util;
+	}
+	free(nodes);
+}
diff --git a/commit.h b/commit.h
--- a/commit.h
+++ b/commit.h
@@ -55,4 +55,9 @@ struct commit *pop_most_recent_commit(st
 struct commit *pop_commit(struct commit_list **stack);
 
 int count_parents(struct commit * commit);
+
+/*
+ * Performs an in-place topological sort of list supplied.
+ */
+void sort_in_topological_order(struct commit_list ** list);
 #endif /* COMMIT_H */
diff --git a/rev-list.c b/rev-list.c
--- a/rev-list.c
+++ b/rev-list.c
@@ -5,10 +5,19 @@
 #include "blob.h"
 #include "epoch.h"
 
-#define SEEN		(1u << 0)
-#define INTERESTING	(1u << 1)
-#define COUNTED		(1u << 2)
-#define SHOWN		(LAST_EPOCH_FLAG << 2)
+#define ABOVE        (1u<<1)
+struct bisect_by_cut_node 
+{
+	unsigned int flags;
+	unsigned int count;
+	struct commit_list * children;
+	struct commit_list * visible_parents;
+};
+
+#define SEEN        (LAST_EPOCH_FLAG << 1)
+#define INTERESTING (LAST_EPOCH_FLAG << 2)
+#define COUNTED     (LAST_EPOCH_FLAG << 3)
+#define SHOWN       (LAST_EPOCH_FLAG << 4)
 
 static const char rev_list_usage[] =
 	"usage: git-rev-list [OPTION] commit-id <commit-id>\n"
@@ -34,6 +43,13 @@ static enum cmit_fmt commit_format = CMI
 static int merge_order = 0;
 static int show_breaks = 0;
 static int stop_traversal = 0;
+static int bisect_by_cut_option = 0;
+static struct commit_list * topological_order = NULL;
+static struct commit_list ** t_o_tail = &topological_order;
+#ifdef DEBUG
+static int debug = 0;
+static unsigned int complexity = 0;
+#endif
 
 static void show_commit(struct commit *commit)
 {
@@ -95,8 +111,10 @@ static int process_commit(struct commit 
 		return CONTINUE;
 	}
 
-	show_commit(commit);
-
+	if (!bisect_by_cut_option) {
+		show_commit(commit);
+	} else
+		t_o_tail = &commit_list_insert(commit, t_o_tail)->next;
 	return CONTINUE;
 }
 
@@ -264,7 +282,7 @@ static int count_distance(struct commit_
 			while (p) {
 				nr += count_distance(p);
 				p = p->next;
-			}
+			}			
 		}
 	}
 	return nr;
@@ -272,6 +290,9 @@ static int count_distance(struct commit_
 
 static void clear_distance(struct commit_list *list)
 {
+#ifdef DEBUG
+	complexity++;
+#endif
 	while (list) {
 		struct commit *commit = list->item;
 		commit->object.flags &= ~COUNTED;
@@ -403,6 +424,196 @@ static struct commit *get_commit_referen
 	die("%s is unknown object", name);
 }
 
+static inline struct bisect_by_cut_node * get_bisect_by_cut_node(struct commit * commit)
+{
+	return (struct bisect_by_cut_node *)commit->object.util;
+}
+
+#ifdef DEBUG
+static void print_bisect_by_cut_node(struct commit * commit, const char * prefix)
+{
+	struct bisect_by_cut_node * node = get_bisect_by_cut_node(commit);
+	fprintf(stderr, "%s%s %u %d\n", prefix, sha1_to_hex(commit->object.sha1), node->flags, node->count);
+}
+#endif
+
+/*
+ * Find the best bisection point by cutting a topological order in two then
+ * identifying a set of boundary nodes with a reachability known to be 
+ * less than the desired bisection point. The boundary is advanced until all nodes
+ * in the boundary have a reachability greater than or equal to the desired 
+ * reachability.
+ *
+ * This algorithm is roughly O(kn) where k is a factor related to the typical
+ * amount of parallel branching within the graph and is not related to n.
+ *
+ * The algorithm borrows the notion of making an arbitrary cut in the graph
+ * from the Kernighan-Lin (1969) graph bisection algorithm but differs in
+ * other respects. In particular, by using the topological order to make the
+ * cut the width of the advancing boundary is reduced to some kind of minimum.
+ * Full graph scans (e.g. calls to count_distance()) are avoided except where
+ * absolutely necessary (i.e. a merge node is encountered).
+ */
+struct commit * bisect_by_cut(struct commit_list * topological_order)
+{
+	unsigned int count=0;
+	unsigned int i;
+	struct commit_list * next;
+	struct commit_list * work = NULL;
+	struct commit * best = topological_order->item;
+	struct bisect_by_cut_node * nodes;
+	struct bisect_by_cut_node * next_node;
+	struct commit_list counter;
+	unsigned int fittest, goal;
+
+	counter.next=NULL;
+	/* count the size of the topological order */
+	next=topological_order;
+	while (next) {
+		count++;
+		next = next->next;
+	}
+	fittest=count;
+	i=(count+1)/2;
+	goal=count/2;
+	/* allocate and initialize the bisect_by_cut_node structure */
+	next_node=nodes=xmalloc(sizeof(*nodes)*count);
+	memset(nodes, 0, sizeof(*nodes)*count);
+	/* 
+	 * initialize the structures for all nodes of interest
+	 */
+	next=topological_order;
+	next_node=nodes;
+	while (next) {		
+		next->item->object.util=next_node++;
+		next=next->next;
+	}
+	/*
+	 * Mark half the nodes as being above the boundary.
+	 * Mark nodes that aren't in the topological order as uninteresting.
+	 * Initialize lists of visible children and parents.
+	 */
+	next=topological_order;
+	next_node=nodes;
+	while (next) {		
+		struct commit * next_item = next->item;
+		struct commit_list * parents;
+		
+		if (i > 0) {
+			next_node->flags |= ABOVE;
+			i--;
+		}			
+		parents=next_item->parents;
+		while (parents) {
+			struct commit * parent = parents->item;
+			struct bisect_by_cut_node * pn = get_bisect_by_cut_node(parent);
+
+			if (pn) {
+				commit_list_insert(next_item, &pn->children);
+				commit_list_insert(parent, &next_node->visible_parents);
+			} else 
+				parent->object.flags |= UNINTERESTING;
+			parent->object.flags &= ~COUNTED;
+			parents=parents->next;
+		}
+		next=next->next;
+		next_node++;
+	}
+	/*
+	 * Initialize the work queue with commits that are on the boundary
+	 * and with <= the desired reachability.
+	 * 
+	 * We know these commits have less than or equal to the desired 
+	 * reachability because by the definition of the topological order
+	 * nodes can only reach nodes to the right of them in the topological
+	 * order and by construction, there there are no more than 
+	 * (count-1)/2 nodes to the right of each node in the boundary.
+	 */
+	next=topological_order;
+	next_node=nodes;
+	while (next) {
+		struct commit_list * parents;
+
+		parents=next_node->visible_parents;
+		while (parents) {
+			struct bisect_by_cut_node * pn = get_bisect_by_cut_node(parents->item);
+
+			if ((next_node->flags & ABOVE) 
+			     && !(pn->flags & ABOVE) 
+			     && (pn->count == 0)) {
+				counter.item=parents->item;
+				pn->count = count_distance(&counter);
+				clear_distance(topological_order);
+				commit_list_insert(parents->item, &work);
+			}
+			parents=parents->next;
+		}
+		next=next->next;
+		next_node++;
+	}
+	/*
+	 * Process the work queue until done.
+	 */
+	while (work) {
+		struct commit * work_item = pop_commit(&work);
+		struct bisect_by_cut_node * wn = get_bisect_by_cut_node(work_item);
+		struct commit_list * children = wn->children;
+		unsigned int goal_test = wn->count;
+	        unsigned int fitness = (goal_test>goal)?(goal_test-goal):(goal-goal_test);
+
+		if (fitness < fittest) {
+			best = work_item;
+			fittest = fitness;
+		}
+#ifdef DEBUG
+		if (debug) {
+			print_bisect_by_cut_node(work_item, "work ");
+		}
+#endif
+		if (goal_test < goal) {
+			while (children) {
+				struct bisect_by_cut_node * cn 
+					= get_bisect_by_cut_node(children->item);
+				if (cn->flags & ABOVE) {
+					/* move the boundary up */
+					cn->flags &= ~ABOVE;
+					if (cn->visible_parents && !cn->visible_parents->next)
+						/* 
+						 * If the child only has one visible parent
+						 * the goal_test increases by one
+						 */
+						cn->count = wn->count+1;
+					else {
+						/*
+						 * Otherwise, we need to count it explicitly.
+						 */
+						counter.item=children->item;
+						cn->count = count_distance(&counter);
+						clear_distance(topological_order);
+					}
+					commit_list_insert(children->item, &work);
+				}
+				children=children->next;
+			}
+		} else if (goal_test > goal)
+			continue;
+		else 
+			break; /* can't do better than this */
+	}
+	if (work)
+		free_commit_list(work);
+	/*
+	 * reset the util pointers.
+	 */
+	next=topological_order;
+	while (next) {
+		next->item->object.util=NULL;
+		next = next->next;
+	}
+	free(nodes);
+	return best;
+}
+
 int main(int argc, char **argv)
 {
 	struct commit_list *list = NULL;
@@ -440,6 +651,10 @@ int main(int argc, char **argv)
 			show_parents = 1;
 			continue;
 		}
+		if (!strcmp(arg, "--bisect-by-cut")) {
+			bisect_by_cut_option = 1;
+			continue;
+		}
 		if (!strcmp(arg, "--bisect")) {
 			bisect_list = 1;
 			continue;
@@ -458,7 +673,12 @@ int main(int argc, char **argv)
 			show_breaks = 1;
 			continue;
 		}
-
+#ifdef DEBUG
+		if (!strcmp(arg, "--debug")) {
+			debug = 1;
+			continue;
+		}
+#endif
 		flags = 0;
 		if (*arg == '^') {
 			flags = UNINTERESTING;
@@ -476,12 +696,20 @@ int main(int argc, char **argv)
 	if (!merge_order) {		
 	        if (limited)
 			list = limit_list(list);
-		show_commit_list(list);
+		if (!bisect_by_cut_option) 
+			show_commit_list(list);
+		else {
+			sort_in_topological_order(&list);
+			show_commit(bisect_by_cut(list));
+		}
 	} else {
 		if (sort_list_in_merge_order(list, &process_commit)) {
 			  die("merge order sort failed\n");
 		}
 	}
-
+#ifdef DEBUG
+	if (debug) 
+		fprintf(stderr, "complexity=%d\n", complexity);
+#endif
 	return 0;
 }
------------

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [EXPERIMENTAL PATCH] O(n) bisection algorithm
  2005-06-30  8:11 [EXPERIMENTAL PATCH] O(n) bisection algorithm Jon Seymour
@ 2005-06-30  9:03 ` Jon Seymour
  0 siblings, 0 replies; 2+ messages in thread
From: Jon Seymour @ 2005-06-30  9:03 UTC (permalink / raw)
  To: git; +Cc: Linus Torvalds

BTW: just in case you are wondering what these variables are for and
why they and the associated code is there...

> +static struct commit_list * topological_order = NULL;
> +static struct commit_list ** t_o_tail = &topological_order;

They are not required. This was a remnant of a previous version that
used the --merge-order code to obtain the topological order. Of
course, the final submission will remove this redundant code.

jon.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2005-06-30  8:56 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-06-30  8:11 [EXPERIMENTAL PATCH] O(n) bisection algorithm Jon Seymour
2005-06-30  9:03 ` Jon Seymour

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).