All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Turner <dturner@twopensource.com>
To: git@vger.kernel.org, mhagger@alum.mit.edu,
	chriscool@tuxfamily.org, pclouds@gmail.com
Cc: David Turner <dturner@twopensource.com>
Subject: [PATCH v3 2/4] path: optimize common dir checking
Date: Wed, 12 Aug 2015 17:57:23 -0400	[thread overview]
Message-ID: <1439416645-19173-2-git-send-email-dturner@twopensource.com> (raw)
In-Reply-To: <1439416645-19173-1-git-send-email-dturner@twopensource.com>

Instead of a linear search over common_list to check whether
a path is common, use a trie.  The trie search operates on
path prefixes, and handles excludes.

Signed-off-by: David Turner <dturner@twopensource.com>
---

Probably overkill, but maybe we could later use it for making exclude
or sparse-checkout matching faster (or maybe we have to go all the way
to McNaughton-Yamada for that to be truly worthwhile).

---
 path.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 201 insertions(+), 14 deletions(-)

diff --git a/path.c b/path.c
index 236f797..21a4ce7 100644
--- a/path.c
+++ b/path.c
@@ -121,25 +121,212 @@ struct common_dir common_list[] = {
 	{ NULL, 0, 0, 0 }
 };
 
-static void update_common_dir(struct strbuf *buf, int git_dir_len)
+/*
+ * A compressed trie.  A trie node consists of zero or more characters that
+ * are common to all elements with this prefix, optionally followed by some
+ * children.  If value is not NULL, the trie node is a terminal node.
+ *
+ * For example, consider the following set of strings:
+ * abc
+ * def
+ * definite
+ * definition
+ *
+ * The trie would look look like:
+ * root: len = 0, value = (something), children a and d non-NULL.
+ *    a: len = 2, contents = bc
+ *    d: len = 2, contents = ef, children i non-NULL, value = (something)
+ *       i: len = 3, contents = nit, children e and i non-NULL, value = NULL
+ *           e: len = 0, children all NULL, value = (something)
+ *           i: len = 2, contents = on, children all NULL, value = (something)
+ */
+struct trie {
+	struct trie *children[256];
+	int len;
+	char *contents;
+	void *value;
+};
+
+static struct trie *make_trie_node(const char *key, void *value)
 {
-	char *base = buf->buf + git_dir_len;
-	const struct common_dir *p;
+	struct trie *new_node = xcalloc(1, sizeof(*new_node));
+	new_node->len = strlen(key);
+	if (new_node->len) {
+		new_node->contents = xmalloc(new_node->len);
+		memcpy(new_node->contents, key, new_node->len);
+	}
+	new_node->value = value;
+	return new_node;
+}
 
-	if (is_dir_file(base, "logs", "HEAD") ||
-	    is_dir_file(base, "info", "sparse-checkout"))
-		return;	/* keep this in $GIT_DIR */
-	for (p = common_list; p->dirname; p++) {
-		const char *path = p->dirname;
-		if (p->is_dir && dir_prefix(base, path)) {
-			replace_dir(buf, git_dir_len, get_git_common_dir());
-			return;
+/*
+ * Add a key/value pair to a trie.  The key is assumed to be \0-terminated.
+ * If there was an existing value for this key, return it.
+ */
+static void *add_to_trie(struct trie *root, const char *key, void *value)
+{
+	struct trie *child;
+	void *old;
+	int i;
+
+	if (!*key) {
+		/* we have reached the end of the key */
+		old = root->value;
+		root->value = value;
+		return old;
+	}
+
+	for (i = 0; i < root->len; ++i) {
+		if (root->contents[i] == key[i])
+			continue;
+
+		/*
+		 * Split this node: child will contain this node's
+		 * existing children.
+		 */
+		child = malloc(sizeof(*child));
+		memcpy(child->children, root->children, sizeof(root->children));
+
+		child->len = root->len - i - 1;
+		if (child->len) {
+			child->contents = strndup(root->contents + i + 1,
+						   child->len);
 		}
-		if (!p->is_dir && !strcmp(base, path)) {
-			replace_dir(buf, git_dir_len, get_git_common_dir());
-			return;
+		child->value = root->value;
+		root->value = NULL;
+		root->len = i;
+
+		memset(root->children, 0, sizeof(root->children));
+		root->children[(unsigned char)root->contents[i]] = child;
+
+		/* This is the newly-added child. */
+		root->children[(unsigned char)key[i]] =
+			make_trie_node(key + i + 1, value);
+		return NULL;
+	}
+
+	/* We have matched the entire compressed section */
+	if (key[i]) {
+		child = root->children[(unsigned char)key[root->len]];
+		if (child) {
+			return add_to_trie(child, key + root->len + 1, value);
+		} else {
+			child = make_trie_node(key + root->len + 1, value);
+			root->children[(unsigned char)key[root->len]] = child;
+			return NULL;
 		}
 	}
+
+	old = root->value;
+	root->value = value;
+	return old;
+}
+
+typedef int (*match_fn)(const char *unmatched, void *data, void *baton);
+
+/*
+ * Search a trie for some key.  Find the longest /-or-\0-terminated
+ * prefix of the key for which the trie contains a value.  Call fn
+ * with the unmatched portion of the key and the found value, and
+ * return its return value.  If there is no such prefix, return -1.
+ *
+ * For example, consider the trie containing only [refs,
+ * refs/worktree] (both with values).
+ *
+ * | key             | unmatched  | val from node | return value |
+ * |-----------------|------------|---------------|--------------|
+ * | a               | not called | n/a           | -1           |
+ * | refs            | \0         | refs          | as per fn    |
+ * | refs/           | /          | refs          | as per fn    |
+ * | refs/w          | /w         | refs          | as per fn    |
+ * | refs/worktree   | \0         | refs/worktree | as per fn    |
+ * | refs/worktree/  | /          | refs/worktree | as per fn    |
+ * | refs/worktree/a | /a         | refs/worktree | as per fn    |
+ * |-----------------|------------|---------------|--------------|
+ *
+ */
+static int trie_find(struct trie *root, const char *key, match_fn fn,
+		     void *baton)
+{
+	int i;
+	int result;
+	struct trie *child;
+
+	if (!*key) {
+		/* we have reached the end of the key */
+		if (root->value && !root->len)
+			return fn(key, root->value, baton);
+		else
+			return -1;
+	}
+
+	for (i = 0; i < root->len; ++i) {
+		if (root->contents[i] != key[i])
+			return -1;
+	}
+
+	/* Matched the entire compressed section */
+	key += i;
+	if (!*key)
+		/* End of key */
+		return fn(key, root->value, baton);
+
+	child = root->children[(unsigned char)*key];
+	if (child)
+		result = trie_find(child, key + 1, fn, baton);
+	else
+		result = -1;
+
+	if (result >= 0 || (*key != '/' && *key != 0))
+		return result;
+	if (root->value)
+		return fn(key, root->value, baton);
+	else
+		return -1;
+}
+
+static struct trie common_trie;
+static int common_trie_done_setup;
+
+static void init_common_trie(void)
+{
+	struct common_dir *p;
+
+	if (common_trie_done_setup)
+		return;
+
+	for (p = common_list; p->dirname; p++)
+		add_to_trie(&common_trie, p->dirname, p);
+
+	common_trie_done_setup = 1;
+}
+
+/*
+ * Helper function for update_common_dir: returns 1 if the dir
+ * prefix is common.
+ */
+static int check_common(const char *unmatched, void *value, void *baton)
+{
+	struct common_dir *dir = value;
+
+	if (!dir)
+		return 0;
+
+	if (dir->is_dir && (unmatched[0] == 0 || unmatched[0] == '/'))
+		return !dir->exclude;
+
+	if (!dir->is_dir && unmatched[0] == 0)
+		return !dir->exclude;
+
+	return 0;
+}
+
+static void update_common_dir(struct strbuf *buf, int git_dir_len)
+{
+	char *base = buf->buf + git_dir_len;
+	init_common_trie();
+	if (trie_find(&common_trie, base, check_common, NULL) > 0)
+		replace_dir(buf, git_dir_len, get_git_common_dir());
 }
 
 void report_linked_checkout_garbage(void)
-- 
2.0.4.315.gad8727a-twtrsrc

  reply	other threads:[~2015-08-12 21:58 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-12 21:57 [PATCH v3 1/4] refs: clean up common_list David Turner
2015-08-12 21:57 ` David Turner [this message]
2015-08-12 22:48   ` [PATCH v3 2/4] path: optimize common dir checking Junio C Hamano
2015-08-13  9:05   ` Michael Haggerty
2015-08-14 17:04     ` Junio C Hamano
2015-08-14 20:04       ` David Turner
2015-08-14 20:27         ` Junio C Hamano
2015-08-14 20:54           ` David Turner
2015-08-15 18:20         ` Michael Haggerty
2015-08-15 18:12       ` Michael Haggerty
2015-08-17 15:55         ` Junio C Hamano
2015-08-15  7:59   ` Duy Nguyen
2015-08-16  5:04     ` David Turner
2015-08-16 12:20       ` Duy Nguyen
2015-08-12 21:57 ` [PATCH v3 3/4] refs: make refs/worktree/* per-worktree David Turner
2015-08-13 17:15   ` Eric Sunshine
2015-08-13 17:41     ` David Turner
2015-08-13 20:16       ` Michael Haggerty
2015-08-13 20:32         ` David Turner
2015-08-14  8:18           ` Michael Haggerty
2015-08-14 17:10             ` Junio C Hamano
2015-08-15  8:04   ` Duy Nguyen
2015-08-12 21:57 ` [PATCH v3 4/4] bisect: make bisection refs per-worktree David Turner
2015-08-15  7:44 ` [PATCH v3 1/4] refs: clean up common_list Duy Nguyen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1439416645-19173-2-git-send-email-dturner@twopensource.com \
    --to=dturner@twopensource.com \
    --cc=chriscool@tuxfamily.org \
    --cc=git@vger.kernel.org \
    --cc=mhagger@alum.mit.edu \
    --cc=pclouds@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.