All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff King <peff@peff.net>
To: "René Scharfe" <l.s.r@web.de>
Cc: git@vger.kernel.org, Michael Haggerty <mhagger@alum.mit.edu>
Subject: Re: [PATCH 0/6] address packed-refs speed regressions
Date: Sun, 5 Apr 2015 14:59:12 -0400	[thread overview]
Message-ID: <20150405185911.GA19902@peff.net> (raw)
In-Reply-To: <20150405185259.GB13096@peff.net>

On Sun, Apr 05, 2015 at 02:52:59PM -0400, Jeff King wrote:

> Right now we parse all of the packed-refs file into an in-memory cache,
> and then do single lookups from that cache. Doing an mmap() and a binary
> search is way faster (and costs less memory) for doing individual
> lookups. It relies on the list being sorted. This is generally true, but
> not something we currently rely on (however, it would be easy to add a
> "sorted" flag to top of the file and have the readers fall back when the
> flag is missing). I've played with a patch to do this (it's not entirely
> trivial, because you jump into the middle of a line, and then have to
> walk backwards to find the start of the record).
> 
> For traversals, it's more complicated. Obviously if you are traversing
> all refs, you have to read the whole thing anyway. If you are traversing
> a subset of the refs, you can binary-search the start of the subset, and
> then walk forward. But that's where it gets tricky with the current
> code.

In case you are curious, here is my proof-of-concept for the packed-refs
binary search. You'll note that it's a separate program, and not
integrated into refs.c. I wrote this last August, and after trying to
integrate it into refs.c, I found the ref_cache problems I described,
and I haven't touched it since.

I also seem to have saved the patch for stuffing it into refs.c, but I
am not sure if it even compiles (I wrote only "horrible wip" in the
commit message ;) ).

-- >8 --
Subject: [PATCH] add git-quick-list

This is a proof of concept for binary-searching the
packed-refs file in order to traverse an ordered subset of
it. Note that it _only_ reads the packed-refs file
currently. To really compare to for-each-ref, it would need
to also walk the loose ref area for its prefix. On a
mostly-packed repository that shouldn't make a big speed
difference, though.

And of course we don't _really_ want a separate command here
at all. This should be part of refs.c, and everyone who
calls for_each_ref should benefit from it.

Still, the numbers are promising. Here's are comparisons
against for-each-ref on torvalds/linux, which has a 218M
packed-refs file:

  $ time git for-each-ref \
      --format='%(objectname) %(refname)' \
      refs/remotes/2325298/ |
      wc -c
  44139

  real    0m1.649s
  user    0m1.332s
  sys     0m0.304s

  $ time ~peff/git-quick-list refs/remotes/2325298/ | wc -c
  44139

  real    0m0.012s
  user    0m0.004s
  sys     0m0.004s
---
 Makefile     |   1 +
 quick-list.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 quick-list.c

diff --git a/Makefile b/Makefile
index 2457065..aa32598 100644
--- a/Makefile
+++ b/Makefile
@@ -541,6 +541,7 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += quick-list.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
diff --git a/quick-list.c b/quick-list.c
new file mode 100644
index 0000000..e423f1f
--- /dev/null
+++ b/quick-list.c
@@ -0,0 +1,174 @@
+#include "cache.h"
+#include "refs.h"
+
+struct packed_refs_iterator {
+	const char *start;
+	const char *end;
+
+	const char *cur;
+	const char *ref;
+	const char *eol;
+	const char *next;
+};
+
+static void iterator_init(struct packed_refs_iterator *pos,
+			  const char *buf, size_t len)
+{
+	pos->start = buf;
+	pos->end = buf + len;
+
+	/* skip past header line */
+	if (pos->start < pos->end && *pos->start == '#') {
+		while (pos->start < pos->end && *pos->start != '\n')
+			pos->start++;
+		if (pos->start < pos->end)
+			pos->start++;
+	}
+}
+
+static int iterator_cmp(const char *key, struct packed_refs_iterator *pos)
+{
+	const char *ref = pos->ref;
+	for (; *key && ref < pos->eol; key++, ref++)
+		if (*key != *ref)
+			return (unsigned char)*key - (unsigned char)*ref;
+	return ref == pos->eol ? *key ? 1 : 0 : -1;
+}
+
+static const char *find_eol(const char *p, const char *end)
+{
+	p = memchr(p, '\n', end - p);
+	return p ? p : end;
+}
+
+static void parse_line(struct packed_refs_iterator *pos, const char *p)
+{
+	pos->cur = p;
+	if (pos->end - p < 41)
+		die("truncated packed-refs file");
+	p += 41;
+
+	pos->ref = p;
+	pos->eol = p = find_eol(p, pos->end);
+
+	/* skip newline, and then past any peel records */
+	if (p < pos->end)
+		p++;
+	while (p < pos->end && *p == '^') {
+		p = find_eol(p, pos->end);
+		if (p < pos->end)
+			p++;
+	}
+	pos->next = p;
+}
+
+static void iterator_next(struct packed_refs_iterator *pos)
+{
+	if (pos->next < pos->end)
+		parse_line(pos, pos->next);
+	else
+		pos->cur = NULL;
+}
+
+static void iterator_start(struct packed_refs_iterator *pos, const char *prefix)
+{
+	const char *lo = pos->start, *hi = pos->end;
+
+	while (lo < hi) {
+		const char *mi = lo + ((hi - lo) / 2);
+		int cmp;
+
+		/*
+		 * We landed somewhere on a line. Walk back to find
+		 * the start of the line.
+		 */
+		while (mi > lo && *(mi-1) != '\n')
+			mi--;
+
+		/*
+		 * We may have hit a peel-line. In that case, try
+		 * to walk back to the actual ref line (and skip as
+		 * many peel lines as we find, for future-proofing).
+		 */
+		while (*mi == '^') {
+			if (mi == lo)
+				die("peel line without a record before it?");
+			mi--;
+			if (mi == lo)
+				die("peel line with bare newline before it?");
+			mi--;
+			while (mi > lo && *(mi-1) != '\n')
+				mi--;
+		}
+
+		/* Now we should be at a real ref line. */
+		parse_line(pos, mi);
+		cmp = iterator_cmp(prefix, pos);
+		if (!cmp)
+			return;
+		else if (cmp < 0)
+			hi = pos->cur;
+		else
+			lo = pos->next;
+	}
+
+	if (hi < pos->end)
+		parse_line(pos, hi);
+	else
+		pos->cur = NULL;
+}
+
+static void quick_list(const char *prefix, each_ref_fn fn, void *data)
+{
+	int fd = open(git_path("packed-refs"), O_RDONLY);
+	struct stat st;
+	const char *buf = NULL;
+	size_t len;
+	struct packed_refs_iterator pos;
+
+	if (fd < 0)
+		goto out;
+	if (fstat(fd, &st) < 0)
+		goto out;
+	len = xsize_t(st.st_size);
+	buf = xmmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (!buf)
+		goto out;
+
+	iterator_init(&pos, buf, len);
+	for (iterator_start(&pos, prefix);
+	     pos.cur && starts_with(pos.ref, prefix);
+	     iterator_next(&pos)) {
+		unsigned char sha1[20];
+		char *refname;
+
+		if (get_sha1_hex(pos.cur, sha1) < 0)
+			die("packed-refs contained invalid sha1");
+		refname = xmemdupz(pos.ref, pos.eol - pos.ref);
+		fn(refname, sha1, 0, data);
+		free(refname);
+	}
+
+out:
+	close(fd);
+	if (buf)
+		munmap((void *)buf, len);
+}
+
+static int show_ref(const char *refname, const unsigned char *sha1,
+		    int flags, void *data)
+{
+	printf("%s %s\n", sha1_to_hex(sha1), refname);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 2)
+		usage("git quick-list <prefix>");
+
+	setup_git_directory();
+	quick_list(argv[1], show_ref, NULL);
+
+	return 0;
+}
-- 
2.4.0.rc0.363.gf9f328b

  reply	other threads:[~2015-04-05 18:59 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-05  1:06 [PATCH 0/6] address packed-refs speed regressions Jeff King
2015-04-05  1:07 ` [PATCH 1/6] strbuf_getwholeline: use getc macro Jeff King
2015-04-05  1:08 ` [PATCH 2/6] git-compat-util: add fallbacks for unlocked stdio Jeff King
2015-04-05  1:11 ` [PATCH 3/6] strbuf_getwholeline: use getc_unlocked Jeff King
2015-04-05  4:56   ` Jeff King
2015-04-05  5:27     ` Jeff King
2015-04-05  5:35       ` Jeff King
2015-04-05 20:49         ` Junio C Hamano
2015-04-05 14:36     ` Duy Nguyen
2015-04-05 18:24       ` Jeff King
2015-04-05 20:09     ` Junio C Hamano
2015-04-07 13:48     ` Rasmus Villemoes
2015-04-07 19:04       ` Jeff King
2015-04-07 22:43         ` Rasmus Villemoes
2015-04-08  0:17           ` Jeff King
2015-04-05  1:11 ` [PATCH 4/6] strbuf: add an optimized 1-character strbuf_grow Jeff King
2015-04-06  2:13   ` Eric Sunshine
2015-04-06  5:05     ` Jeff King
2015-04-05  1:11 ` [PATCH 5/6] t1430: add another refs-escape test Jeff King
2015-04-05  1:15 ` [PATCH 6/6] refname_is_safe: avoid expensive normalize_path_copy call Jeff King
2015-04-05 13:41 ` [PATCH 0/6] address packed-refs speed regressions René Scharfe
2015-04-05 18:52   ` Jeff King
2015-04-05 18:59     ` Jeff King [this message]
2015-04-05 23:04       ` René Scharfe
2015-04-05 22:39     ` René Scharfe
2015-04-06  4:49       ` Jeff King
2015-04-16  8:47 ` [PATCH v2 0/9] " Jeff King
2015-04-16  8:48   ` [PATCH 1/9] strbuf_getwholeline: use getc macro Jeff King
2015-04-16  8:48   ` [PATCH 2/9] git-compat-util: add fallbacks for unlocked stdio Jeff King
2015-04-16  8:49   ` [PATCH 3/9] strbuf_getwholeline: use getc_unlocked Jeff King
2015-04-16  8:51   ` [PATCH 4/9] config: use getc_unlocked when reading from file Jeff King
2015-04-16  8:53   ` [PATCH 5/9] strbuf_addch: avoid calling strbuf_grow Jeff King
2015-04-16  8:58   ` [PATCH 6/9] strbuf_getwholeline: " Jeff King
2015-04-16  9:01   ` [PATCH 7/9] strbuf_getwholeline: use getdelim if it is available Jeff King
2015-04-17 10:16     ` Eric Sunshine
2015-04-21 23:09       ` Jeff King
2015-05-08 23:56         ` Eric Sunshine
2015-05-09  1:09           ` Jeff King
2015-06-02 18:22             ` Eric Sunshine
2015-04-22 18:00       ` Johannes Schindelin
2015-04-22 18:06         ` Jeff King
2015-04-16  9:03   ` [PATCH 8/9] read_packed_refs: avoid double-checking sane refs Jeff King
2015-04-16  9:04   ` [PATCH 9/9] t1430: add another refs-escape test Jeff King
2015-04-16  9:25   ` [PATCH v2 0/9] address packed-refs speed regressions Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150405185911.GA19902@peff.net \
    --to=peff@peff.net \
    --cc=git@vger.kernel.org \
    --cc=l.s.r@web.de \
    --cc=mhagger@alum.mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.