git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jeff King <peff@peff.net>
To: Thomas Rast <trast@inf.ethz.ch>
Cc: git@vger.kernel.org, "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>,
	"Junio C Hamano" <gitster@pobox.com>
Subject: [PATCH] process tree diffs during "rev-list --objects"
Date: Wed, 1 May 2013 20:53:17 -0400	[thread overview]
Message-ID: <20130502005317.GA29159@sigill.intra.peff.net> (raw)
In-Reply-To: <20130501204947.GA12789@sigill.intra.peff.net>

On Wed, May 01, 2013 at 04:49:47PM -0400, Jeff King wrote:

> Another avenue I'd like to explore is actually doing a tree-diff from
> the last processed commit, since we should need to examine only the
> changed entries. I suspect it won't be a big benefit, though, because
> even though the tree diff can happen in O(# of entries), we are trying
> to beat doing an O(1) hash for each entry. So it's the same algorithmic
> complexity, and it is hard to beat a few hashcmp() calls. We'll see.

As I suspected, this is not a win:

Test                               origin            this tree
--------------------------------------------------------------------------
0001.1: rev-list --all             0.40(0.37+0.02)   0.40(0.38+0.01) +0.0%
0001.2: rev-list --all --objects   2.22(2.16+0.05)   2.41(2.16+0.24) +8.6%

I've included the patch below for reference. It really does work as
planned; the number of calls to lookup_object for doing "rev-list --all
--objects" on git.git is reduced from ~15M to ~2.6M. But the benefit is
eaten up by the extra tree-processing time. For fun, here's an excerpt
from the "perf diff" between the two versions:

            +10.51%  git                 [.] update_tree_entry
             +8.97%  git                 [.] lookup_object
             +7.81%  git                 [.] process_tree
             +5.20%  libc-2.13.so        [.] __memcmp_sse4_1
             +3.77%  libc-2.13.so        [.] __strlen_sse42
             +2.69%  git                 [.] base_name_compare
             [....]
             -7.90%  git                 [.] tree_entry
            -39.19%  git                 [.] lookup_object

Shawn had suggested trying to store the tree deltas in such a way that
these tree comparisons could be done more cheaply. That might tip things
in favor of the tree-diff approach.

-Peff

---
diff --git a/list-objects.c b/list-objects.c
index 3dd4a96..b87a049 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -59,8 +59,57 @@ static void process_tree(struct rev_info *revs,
 	/* Nothing to do */
 }
 
+static struct tree *skip_processed_entries(struct tree_desc *cur,
+					   struct tree_desc *old)
+{
+	while (cur->size && old->size) {
+		int cmp = base_name_compare(cur->entry.path,
+					    tree_entry_len(&cur->entry),
+					    0, /* ignore mode */
+					    old->entry.path,
+					    tree_entry_len(&old->entry),
+					    0);
+		if (cmp > 0) {
+			/*
+			 * Old tree has something we do not; ignore it, as it
+			 * was already processed.
+			 */
+			update_tree_entry(old);
+		}
+		else if (cmp == 0) {
+			/*
+			 * We have the same path; if the sha1s match, then we
+			 * have already processed it and can ignore. Otherwise,
+			 * return for processing, but also provide the old
+			 * tree so that we can recurse.
+			 */
+			if (!hashcmp(cur->entry.sha1, old->entry.sha1)) {
+				update_tree_entry(cur);
+				update_tree_entry(old);
+			}
+			else {
+				struct object *o = lookup_object(old->entry.sha1);
+				update_tree_entry(old);
+				if (o && o->type == OBJ_TREE)
+					return (struct tree *)o;
+				else
+					return NULL;
+			}
+		}
+		else {
+			/*
+			 * We have an entry the old one does not; we must look
+			 * at it (and there is no matching old tree to report).
+			 */
+			return NULL;
+		}
+	}
+	return NULL;
+}
+
 static void process_tree(struct rev_info *revs,
 			 struct tree *tree,
+			 struct tree *old_tree,
 			 show_object_fn show,
 			 struct name_path *path,
 			 struct strbuf *base,
@@ -68,7 +117,7 @@ static void process_tree(struct rev_info *revs,
 			 void *cb_data)
 {
 	struct object *obj = &tree->object;
-	struct tree_desc desc;
+	struct tree_desc desc, old_desc;
 	struct name_entry entry;
 	struct name_path me;
 	enum interesting match = revs->diffopt.pathspec.nr == 0 ?
@@ -97,7 +146,18 @@ static void process_tree(struct rev_info *revs,
 
 	init_tree_desc(&desc, tree->buffer, tree->size);
 
-	while (tree_entry(&desc, &entry)) {
+	if (old_tree)
+		init_tree_desc(&old_desc, old_tree->buffer, old_tree->size);
+	else
+		init_tree_desc(&old_desc, NULL, 0);
+
+	while (desc.size) {
+		struct tree *old_tree;
+
+		old_tree = skip_processed_entries(&desc, &old_desc);
+		if (!tree_entry(&desc, &entry))
+			break;
+
 		if (match != all_entries_interesting) {
 			match = tree_entry_interesting(&entry, base, 0,
 						       &revs->diffopt.pathspec);
@@ -109,7 +169,7 @@ static void process_tree(struct rev_info *revs,
 
 		if (S_ISDIR(entry.mode))
 			process_tree(revs,
-				     lookup_tree(entry.sha1),
+				     lookup_tree(entry.sha1), old_tree,
 				     show, &me, base, entry.path,
 				     cb_data);
 		else if (S_ISGITLINK(entry.mode))
@@ -123,8 +183,6 @@ static void process_tree(struct rev_info *revs,
 				     cb_data);
 	}
 	strbuf_setlen(base, baselen);
-	free(tree->buffer);
-	tree->buffer = NULL;
 }
 
 static void mark_edge_parents_uninteresting(struct commit *commit,
@@ -173,6 +231,7 @@ void traverse_commit_list(struct rev_info *revs,
 	int i;
 	struct commit *commit;
 	struct strbuf base;
+	struct tree *last_root_tree = NULL;
 
 	strbuf_init(&base, PATH_MAX);
 	while ((commit = get_revision(revs)) != NULL) {
@@ -196,8 +255,9 @@ void traverse_commit_list(struct rev_info *revs,
 			continue;
 		}
 		if (obj->type == OBJ_TREE) {
-			process_tree(revs, (struct tree *)obj, show_object,
-				     NULL, &base, name, data);
+			process_tree(revs, (struct tree *)obj, last_root_tree,
+				     show_object, NULL, &base, name, data);
+			last_root_tree = (struct tree *)obj;
 			continue;
 		}
 		if (obj->type == OBJ_BLOB) {

  reply	other threads:[~2013-05-02  0:53 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-04-25 18:04 [PATCH] Hold an 'unsigned long' chunk of the sha1 in obj_hash Thomas Rast
2013-04-25 20:13 ` Junio C Hamano
2013-04-25 21:09   ` Thomas Rast
2013-04-25 21:12 ` Duy Nguyen
2013-05-01 20:49 ` Jeff King
2013-05-02  0:53   ` Jeff King [this message]
2013-05-02  8:52   ` Thomas Rast

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130502005317.GA29159@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=pclouds@gmail.com \
    --cc=trast@inf.ethz.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).