* [PATCH 1/3] Lazily open pack index files on demand
From: Shawn O. Pearce @ 2007-05-26 5:24 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Dana How
In some repository configurations the user may have many packfiles,
but all of the recent commits/trees/tags/blobs are likely to
be in the most recent packfile (the one with the newest mtime).
It is therefore common to be able to complete an entire operation
by accessing only one packfile, even if there are 25 packfiles
available to the repository.
Rather than opening and mmaping the corresponding .idx file for
every pack found, we now only open and map the .idx when we suspect
there might be an object of interest in there.
Of course we cannot know in advance which packfile contains an
object, so we still need to scan the entire packed_git list to
locate anything. But odds are users want to access objects in the
most recently created packfiles first, and that may be all they
ever need for the current operation.
Junio observed in b867092f that placing recent packfiles before
older ones can slightly improve access times for recent objects,
without degrading access to historical objects.
This change improves upon Junio's observations by trying even harder
to avoid the .idx files that we won't need.
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
This conflicts (in a subtle way) with Dana How's
"sha1_file.c:rearrange_packed_git() should consider packs' object
sizes" patch as we now have num_objects = 0 for any indexes we
have not opened. In the case of Dana's patch this would cause
those packfiles to have very high ranks, possibly sorting much
later than they should have.
builtin-count-objects.c | 2 ++
cache.h | 3 ++-
pack-check.c | 9 +++++++--
pack-redundant.c | 3 +++
sha1_file.c | 38 +++++++++++++++++++++++++++++++++++---
5 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/builtin-count-objects.c b/builtin-count-objects.c
index ff90ebd..ac65e03 100644
--- a/builtin-count-objects.c
+++ b/builtin-count-objects.c
@@ -111,6 +111,8 @@ int cmd_count_objects(int ac, const char **av, const char *prefix)
for (p = packed_git; p; p = p->next) {
if (!p->pack_local)
continue;
+ if (!p->index_data && open_pack_index(p))
+ continue;
packed += p->num_objects;
num_pack++;
}
diff --git a/cache.h b/cache.h
index cd875bc..0f4a05b 100644
--- a/cache.h
+++ b/cache.h
@@ -485,10 +485,11 @@ extern struct packed_git *find_sha1_pack(const unsigned char *sha1,
struct packed_git *packs);
extern void pack_report(void);
+extern int open_pack_index(struct packed_git *);
extern unsigned char* use_pack(struct packed_git *, struct pack_window **, off_t, unsigned int *);
extern void unuse_pack(struct pack_window **);
extern struct packed_git *add_packed_git(const char *, int, int);
-extern const unsigned char *nth_packed_object_sha1(const struct packed_git *, uint32_t);
+extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t);
extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
diff --git a/pack-check.c b/pack-check.c
index d04536b..7475348 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -129,12 +129,17 @@ static void show_pack_info(struct packed_git *p)
int verify_pack(struct packed_git *p, int verbose)
{
- off_t index_size = p->index_size;
- const unsigned char *index_base = p->index_data;
+ off_t index_size;
+ const unsigned char *index_base;
SHA_CTX ctx;
unsigned char sha1[20];
int ret;
+ if (open_pack_index(p))
+ return error("packfile %s index not opened", p->pack_name);
+ index_size = p->index_size;
+ index_base = p->index_data;
+
ret = 0;
/* Verify SHA1 sum of the index file */
SHA1_Init(&ctx);
diff --git a/pack-redundant.c b/pack-redundant.c
index 87077e1..0617320 100644
--- a/pack-redundant.c
+++ b/pack-redundant.c
@@ -550,6 +550,9 @@ static struct pack_list * add_pack(struct packed_git *p)
l.pack = p;
llist_init(&l.all_objects);
+ if (!p->index_data && open_pack_index(p))
+ return NULL;
+
base = p->index_data;
base += 256 * 4 + ((p->index_version < 2) ? 4 : 8);
step = (p->index_version < 2) ? 24 : 20;
diff --git a/sha1_file.c b/sha1_file.c
index 12d2ef2..6a5ba63 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -530,6 +530,21 @@ static int check_packed_git_idx(const char *path, struct packed_git *p)
return 0;
}
+int open_pack_index (struct packed_git *p)
+{
+ char *idx_name;
+ int ret;
+
+ if (p->index_data)
+ return 0;
+
+ idx_name = xstrdup(p->pack_name);
+ strcpy(idx_name + strlen(idx_name) - strlen(".pack"), ".idx");
+ ret = check_packed_git_idx(idx_name, p);
+ free(idx_name);
+ return ret;
+}
+
static void scan_windows(struct packed_git *p,
struct packed_git **lru_p,
struct pack_window **lru_w,
@@ -605,6 +620,9 @@ static int open_packed_git_1(struct packed_git *p)
unsigned char *idx_sha1;
long fd_flag;
+ if (!p->index_data && open_pack_index(p))
+ return error("packfile %s index unavailable", p->pack_name);
+
p->pack_fd = open(p->pack_name, O_RDONLY);
if (p->pack_fd < 0 || fstat(p->pack_fd, &st))
return -1;
@@ -757,8 +775,7 @@ struct packed_git *add_packed_git(const char *path, int path_len, int local)
return NULL;
memcpy(p->pack_name, path, path_len);
strcpy(p->pack_name + path_len, ".pack");
- if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode) ||
- check_packed_git_idx(path, p)) {
+ if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode)) {
free(p);
return NULL;
}
@@ -766,6 +783,10 @@ struct packed_git *add_packed_git(const char *path, int path_len, int local)
/* ok, it looks sane as far as we can check without
* actually mapping the pack file.
*/
+ p->index_version = 0;
+ p->index_data = NULL;
+ p->index_size = 0;
+ p->num_objects = 0;
p->pack_size = st.st_size;
p->next = NULL;
p->windows = NULL;
@@ -1572,10 +1593,15 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
return data;
}
-const unsigned char *nth_packed_object_sha1(const struct packed_git *p,
+const unsigned char *nth_packed_object_sha1(struct packed_git *p,
uint32_t n)
{
const unsigned char *index = p->index_data;
+ if (!index) {
+ if (open_pack_index(p))
+ return NULL;
+ index = p->index_data;
+ }
if (n >= p->num_objects)
return NULL;
index += 4 * 256;
@@ -1612,6 +1638,12 @@ off_t find_pack_entry_one(const unsigned char *sha1,
const unsigned char *index = p->index_data;
unsigned hi, lo;
+ if (!index) {
+ if (open_pack_index(p))
+ return 0;
+ level1_ofs = p->index_data;
+ index = p->index_data;
+ }
if (p->index_version > 1) {
level1_ofs += 2;
index += 8;
--
1.5.2.789.g8ee1
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Junio C Hamano @ 2007-05-26 8:29 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git, Dana How
"Shawn O. Pearce" <spearce@spearce.org> writes:
> This conflicts (in a subtle way) with Dana How's
> "sha1_file.c:rearrange_packed_git() should consider packs' object
> sizes" patch as we now have num_objects = 0 for any indexes we
> have not opened. In the case of Dana's patch this would cause
> those packfiles to have very high ranks, possibly sorting much
> later than they should have.
I am keeping that rearrange stuff on hold, partly because I am
moderately hesitant to do the fp, which feels overkill at that
low level of code.
Also, I am hoping that we can discard the object density
criteria altogether by making the default repack behaviour
friendlier to the pathological cases, e.g. by emitting huge
blobs at the end of the packstream, potentially pushing it out
to later parts of split packs by themselves and automatically
marking them with the .keep flag. Until that kind of
improvements materialize, people with pathological cases could
(1) handcraft a pack that contains only megablob, (2) place that
on central alternate, (3) touch it with artificially old
timestamp, which hopefully is a good enough workaround.
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Shawn O. Pearce @ 2007-05-26 17:30 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Dana How
Junio C Hamano <junkio@cox.net> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
> > This conflicts (in a subtle way) with Dana How's
> > "sha1_file.c:rearrange_packed_git() should consider packs' object
> > sizes" patch as we now have num_objects = 0 for any indexes we
> > have not opened. In the case of Dana's patch this would cause
> > those packfiles to have very high ranks, possibly sorting much
> > later than they should have.
>
> I am keeping that rearrange stuff on hold, partly because I am
> moderately hesitant to do the fp, which feels overkill at that
> low level of code.
Yeah, I've actually been having similar thoughts.
> Also, I am hoping that we can discard that the object density
> criteria altogether by making the default repack behaviour
> friendlier to the pathological cases, e.g. by emitting huge
> blobs at the end of the packstream, potentially pushing it out
> to later parts of split packs by themselves and automatically
> marking them with the .keep flag. Until that kind of
> improvements materialize, people with pathological cases could
> (1) handcraft a pack that contains only megablob, (2) place that
> on central alternate, (3) touch it with artificially old
> timestamp, which hopefully is a good enough workaround.
Right, I was having the same idea. If we have pack-objects just
shuffle the really big stuff to the end of its object list they
will naturally fall into the end of the packfile, and the split
out packfiles. Then if we do the mtime flipping you suggested
earlier right before we exit pack-objects the larger blob packs
will automatically sort behind the smaller commit/tree packs.
No fp needed.
My patch was exactly because I did what you say above; I handcrafted
a pack that contains only large-ish blobs, placed them into a
central repo, and connected it by alternates. Because of the
local flag logic it is automatically behind my commit/tree pack.
But I also rarely (if ever) have to access that megablob packfile.
Yet the .idx was still being opened. On Cygwin/Windows that penalty
is high enough to have almost doubled the running time of a simple
"git show".
--
Shawn.
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Dana How @ 2007-05-26 17:31 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Shawn O. Pearce, git, danahow
On 5/26/07, Junio C Hamano <junkio@cox.net> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
> > This conflicts (in a subtle way) with Dana How's
> > "sha1_file.c:rearrange_packed_git() should consider packs' object
> > sizes" patch as we now have num_objects = 0 for any indexes we
> > have not opened. In the case of Dana's patch this would cause
> > those packfiles to have very high ranks, possibly sorting much
> > later than they should have.
> I am keeping that rearrange stuff on hold, partly because I am
> moderately hesitant to do the fp, which feels overkill at that
> low level of code.
Oh, I thought the fp might cause a gag reflex -- I had to add -lm.
Unfortunately, when trying to automatically detect and grade outliers,
which is what I was trying to do, (datum - mean) / std_dev is hard to beat,
and I needed sqrt for std_dev -- all other fp could be easily written out.
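For reference, that grading is just a z-score; a stand-alone sketch (not
the patch itself, names are illustrative) shows why sqrt(), and hence
-lm, creeps in:

#include <math.h>	/* sqrt() is the only reason -lm shows up */

/* Sketch only: grade each datum by how many standard deviations it
 * sits away from the mean -- the usual z-score outlier test. */
static void grade_outliers(const double *datum, unsigned n, double *z)
{
	double sum = 0.0, sumsq = 0.0, mean, std_dev;
	unsigned i;

	for (i = 0; i < n; i++) {
		sum += datum[i];
		sumsq += datum[i] * datum[i];
	}
	mean = sum / n;
	std_dev = sqrt(sumsq / n - mean * mean);

	for (i = 0; i < n; i++)
		z[i] = std_dev ? (datum[i] - mean) / std_dev : 0.0;
}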
> Also, I am hoping that we can discard that the object density
> criteria altogether by making the default repack behaviour
> friendlier to the pathological cases, e.g. by emitting huge
> blobs at the end of the packstream, potentially pushing it out
> to later parts of split packs by themselves and automatically
> marking them with the .keep flag. Until that kind of
> improvements materialize, people with pathological cases could
> (1) handcraft a pack that contains only megablob, (2) place that
> on central alternate, (3) touch it with artificially old
> timestamp, which hopefully is a good enough workaround.
I think we should do what we can to make the timestamp as
meaningful as possible, which is why I submitted that stamping patch.
I think there are two interesting strategies compatible
with maximally-informative timestamps:
(1) git-repack -a -d repacks everything on each call. You would need:
(1a) Rewrite builtin-pack-objects.c so only the object_ix hash
accesses the "objects" array directly, everything else
goes through a pointer table.
(1b) Sort the new pointer table by object type, in order
tag -> commit -> tree -> nice blob -> naughty blob.
The sort is stable so the order within each group is unchanged.
(1c) Do not deltify naughty blobs. Naughty blobs are those
blobs marked "nodelta" or very large blobs.
(1d) Write out objects in new pointer table order. Splitting
will cause metadata to be in first pack, naughty blobs
tend to be in the last pack.
(1e) When done writing all packs, swap their timestamps
so current timestamp sorting will look at naughty blobs last.
(2) git-repack -a -d runs in two passes and maintains .keep files:
(2a) Add a new flag --types=[gctb]+ to pack-objects to be supplied
by git-repack. This means only taGs/Commits/Trees/Blobs
are to be passed, all others dropped.
(2b) Put a new loop around the core of git-repack. In the first iteration,
pack with --types=b, then with --types=gct in the second.
Thus metadata will have more recent timestamp.
(2c) If packs are split, also swap timestamps like in (1e),
within each iteration.
(2d) If an iteration produces split packs, mark all but the last
in the sequence with a .keep file automatically. The
.keep files contain the string "repack".
(2e) Add new option to repack: -A. If specified, the first
thing repack does is remove any keep file containing "repack".
(2f) The existing response of repack to keep files -- do not repack them --
is retained to ensure on each -a/not -A repack, we only
repack the tail of each set of packs: metadata, data.
The metadata set will probably only ever contain one pack
and will always be repacked.
I've (badly) implemented (1b) and confirmed it had no impact
on the linux-2.6 repo. I've also implemented (2a), (2b), (2d), and (2f),
but not fully measured them. I'd like to finish this work, but
"megapacks" are very time-consuming to manipulate, and
with the loose megablob approach they are not as useful for me.
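For illustration, the ordering in (1b) could be expressed as a qsort()
comparator over a hypothetical pointer-table entry; qsort() is not
stable, so the original position breaks ties (names here are invented,
not the builtin-pack-objects.c ones):

#include <stdlib.h>

/* Sketch only: rank entries tag -> commit -> tree -> nice blob ->
 * naughty blob; the original position breaks ties so the sort
 * behaves stably within each group. */
struct ptr_ent {
	int type_rank;		/* 0 tag, 1 commit, 2 tree, 3 nice, 4 naughty */
	unsigned orig_pos;	/* index before sorting */
};

static int cmp_type_stable(const void *a_, const void *b_)
{
	const struct ptr_ent *a = a_, *b = b_;

	if (a->type_rank != b->type_rank)
		return a->type_rank - b->type_rank;
	return a->orig_pos < b->orig_pos ? -1 : 1;
}

/* usage: qsort(table, nr, sizeof(*table), cmp_type_stable); */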
Finally, some people might want more esoteric repacking
strategies than what I've listed above. We could add a
--packed flag to pack-objects to help them. This means that
git pack-objects --packed --unpacked=<pack1> --unpacked=<pack2>
would only repack pack1 and pack2 and would not absorb
any loose blobs. This would allow you to maintain any number
of packfile classes you want and maintain them yourself.
Each would be indicated by something different in a .keep file.
(To newly absorb loose blobs in a class, you would do
cat object-list | git-pack-objects --incremental
from some object-list you built following your rules).
These strategies would be too special-purpose to be in git,
but adding --packed is a small and useful change.
Shawn: When I first saw the index-loading code, my first
thought was that all the index tables should be
merged (easy since sorted) so callers only need to do one search.
With indices loaded lazily, either you can't merge, or you
merge sequentially, raising merge cost from (total entries) to
almost (index files) * (total entries). What do you think about
merging the SHA-1 tables, and how would/should it interact with
lazy index file loading?
BTW, if it's not apparent, I think my object density patch
should be dropped. It has served its purpose as a thought experiment.
Thanks,
--
Dana L. How danahow@gmail.com +1 650 804 5991 cell
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 2:43 UTC (permalink / raw)
To: Dana How; +Cc: Junio C Hamano, Shawn O. Pearce, git
On Sat, 26 May 2007, Dana How wrote:
> I think there are two interesting strategies compatible
> with maximally-informative timestamps:
>
> (1) git-repack -a -d repacks everything on each call. You would need:
> (1a) Rewrite builtin-pack-objects.c so only the object_ix hash
> accesses the "objects" array directly, everything else
> goes through a pointer table.
> (1b) Sort the new pointer table by object type, in order
> tag -> commit -> tree -> nice blob -> naughty blob.
> The sort is stable so the order within each group is unchanged.
This is not a good idea in general for runtime access to the pack. If
you consider a checkout, the commit object
is looked up, then the root tree object, then each tree entry is
recursively looked up. Right now the way the objects are laid out, the
most recent commit will have all its objects contiguously found in the
pack and in the right order (that means tree and blobs mixed up). This
gets less and less true as you go back into history, but at least the
recent stuff has a really nice access pattern.
Because commit objects are so fundamental to many graph operations they
are already all packed together. But tree and blob objects are
intermixed for the reason stated above.
Naughty blobs are a really special category and I think they should be
treated as such. Therefore I don't think the common/normal case should
be impacted with a generic change for something that is still a special
case.
In other words, I think the naughty blob could simply be recognized as
such and be referenced in a special list instead of being written out
initially. Then when everything is believed to be written, the special
list can be walked to force-write those naughty blobs last. No need
to modify the current object order.
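As a rough sketch of that (hypothetical types, and a caller-supplied
write_one callback standing in for pack-objects' real write path; two
passes instead of an explicit side list, with the same effect):

/* Sketch only: write everything except the huge blobs first, in the
 * normal order, then flush the deferred "naughty" blobs at the very
 * end of the pack. */
struct sketch_entry {
	enum { S_COMMIT, S_TREE, S_BLOB, S_TAG } type;
	unsigned long size;
};

static void write_deferring_naughty(struct sketch_entry *obj, unsigned nr,
				    unsigned long naughty_limit,
				    void (*write_one)(struct sketch_entry *))
{
	unsigned i;

	for (i = 0; i < nr; i++)	/* pass 1: nice objects, normal order */
		if (!(obj[i].type == S_BLOB && obj[i].size >= naughty_limit))
			write_one(&obj[i]);

	for (i = 0; i < nr; i++)	/* pass 2: naughty blobs go last */
		if (obj[i].type == S_BLOB && obj[i].size >= naughty_limit)
			write_one(&obj[i]);
}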
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Shawn O. Pearce @ 2007-05-27 3:34 UTC (permalink / raw)
To: Dana How; +Cc: Junio C Hamano, git
Dana How <danahow@gmail.com> wrote:
> Shawn: When I first saw the index-loading code, my first
> thought was that all the index tables should be
> merged (easy since sorted) so callers only need to do one search.
Yes; in fact this has been raised on the list before. The general
idea was to create some sort of "super index" that had a list of
all objects and which packfile they could be found in. This way the
running process doesn't have to search multiple indexes, and the
process doesn't have to be responsible for the merging itself.
See the thing is, if you read all of every .idx file on a simple
`git-log` operation you've already lost. The number of trees and
blobs tends to far outweigh the number of commits and they really
outweigh the number of commits the average user looks at in a
`git-log` session before they abort their pager. So sorting all
of the available .idx files before we produce even the first commit
is a horrible thing to do.
But the problem with a super index is repacking. Every time the user
repacks their recent loose objects (or recently fetched packs) we are
folding some packfiles together, but may be leaving others alone.
The super index would need to account for the packfiles we aren't
looking at or repacking. It gets complicated fast.
There's also the problem of alternate ODBs; do we fold the indexes
of our alternates into our own super index? Or does each ODB get
its own super index and we still have to load multiple super index
files?
In pack v4 we're likely to move the SHA-1 table from the .idx file
into the front of the .pack file. This makes the .idx file hold
only the offsets and the CRC checksums of each object. If we start
making a super index, we have to duplicate the SHA-1 table twice
(once in the .pack, again in the super index).
--
Shawn.
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Dana How @ 2007-05-27 4:31 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, Shawn O. Pearce, git, danahow
On 5/26/07, Nicolas Pitre <nico@cam.org> wrote:
> On Sat, 26 May 2007, Dana How wrote:
> > (1) git-repack -a -d repacks everything on each call. You would need:
> > (1a) Rewrite builtin-pack-objects.c so only the object_ix hash
> > accesses the "objects" array directly, everything else
> > goes through a pointer table.
> > (1b) Sort the new pointer table by object type, in order
> > tag -> commit -> tree -> nice blob -> naughty blob.
> > The sort is stable so the order within each group is unchanged.
>
> Because commit objects are so fundamental to many graph operations they
> are already all packed together. But tree and blob objects are
> intermixed for the reason stated above.
I noticed that all the commits were together and
wondered if that was deliberate.
> The naughty blob is a really special category and I think they should be
> treated as such. Therefore I don't think the common/normal case should
> be impacted with a generic change for something that is still a special
> case.
This argument makes sense.
> In other words, I think the naughty blob could simply be recognized as
> such and be referenced in a special list instead of being written out
> initially. Then when everything is believed to be written, the special
> list can be walked to force write those naughty blob at last. No need
> to modify the current object order.
This works as long as a naughty blob can't be a delta base for a nice blob
(causing it to be pushed out early by the recursion in write_one()).
I think that's a reasonable and understandable restriction.
Thanks,
--
Dana L. How danahow@gmail.com +1 650 804 5991 cell
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Dana How @ 2007-05-27 4:40 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, git, danahow
On 5/26/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> Dana How <danahow@gmail.com> wrote:
> > Shawn: When I first saw the index-loading code, my first
> > thought was that all the index tables should be
> > merged (easy since sorted) so callers only need to do one search.
>
> Yes; in fact this has been raised on the list before. The general
> idea was to create some sort of "super index" that had a list of
> all objects and which packfile they could be found in. This way the
> running process doesn't have to search multiple indexes, and the
> process doesn't have to be responsible for the merging itself.
>
> See the thing is, if you read all of every .idx file on a simple
> `git-log` operation you've already lost. The number of trees and
> blobs tends to far outweigh the number of commits and they really
> outweigh the number of commits the average user looks at in a
> `git-log` session before they abort their pager. So sorting all
> of the available .idx files before we produce even the first commit
> is a horrible thing to do.
>
> But the problem with a super index is repacking. Every time the user
> repacks their recent loose objects (or recently fetched packs) we are
> folding some packfiles together, but may be leaving others alone.
> The super index would need to account for the packfiles we aren't
> looking at or repacking. It gets complicated fast.
>
> There's also the problem of alternate ODBs; do we fold the indexes
> of our alternates into our own super index? Or does each ODB get
> its own super index and we still have to load multiple super index
> files?
Yes, the problem is that even an on-demand, "lazy" merge
is likely to require far more work than the expected number of index probes.
> In pack v4 we're likely to move the SHA-1 table from the .idx file
> into the front of the .pack file. This makes the .idx file hold
> only the offsets and the CRC checkums of each object. If we start
> making a super index, we have to duplicate the SHA-1 table twice
> (once in the .pack, again in the super index).
Hmm, hopefully the SHA-1 table can go at the _end_
since with split packs that's the only time we know the number
of objects in the pack... ;-)
Thanks,
--
Dana L. How danahow@gmail.com +1 650 804 5991 cell
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Martin Koegler @ 2007-05-27 10:46 UTC (permalink / raw)
To: Dana How; +Cc: Shawn O. Pearce, Junio C Hamano, git
Dana How wrote:
> (1c) Do not deltify naughty blobs. Naughty blobs are those
> blobs marked "nodelta" or very large blobs.
I don't like the idea of excluding any blobs from delta by default if
the delta could be done. If the "very large blobs" are text files with
very few differences, they deltify very well.
Additionally, how do you want to define "very large blobs"?
For a system with e.g. 256MB of RAM, deltifying a blob of several hundred
MB is a problem, whereas it is no problem if you have several GB of RAM.
Regards, Martin Kögler
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 14:41 UTC (permalink / raw)
To: Dana How; +Cc: Junio C Hamano, Shawn O. Pearce, git
On Sat, 26 May 2007, Dana How wrote:
> On 5/26/07, Nicolas Pitre <nico@cam.org> wrote:
> > In other words, I think the naughty blob could simply be recognized as
> > such and be referenced in a special list instead of being written out
> > initially. Then when everything is believed to be written, the special
> > list can be walked to force write those naughty blob at last. No need
> > to modify the current object order.
> This works as long as a naughty blob can't be a delta base for a nice blob
> (causing it to be pushed out early by the recursion in write_one()).
> I think that's a reasonable and understandable restriction.
Sure. Or the delta can inherit the naughty property if its base is also
naughty, which solves the problem nicely.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 15:26 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Dana How, Junio C Hamano, git
On Sat, 26 May 2007, Shawn O. Pearce wrote:
> Dana How <danahow@gmail.com> wrote:
> > Shawn: When I first saw the index-loading code, my first
> > thought was that all the index tables should be
> > merged (easy since sorted) so callers only need to do one search.
>
> Yes; in fact this has been raised on the list before. The general
> idea was to create some sort of "super index" that had a list of
> all objects and which packfile they could be found in. This way the
> running process doesn't have to search multiple indexes, and the
> process doesn't have to be responsible for the merging itself.
>
> See the thing is, if you read all of every .idx file on a simple
> `git-log` operation you've already lost. The number of trees and
> blobs tends to far outweigh the number of commits and they really
> outweigh the number of commits the average user looks at in a
> `git-log` session before they abort their pager. So sorting all
> of the available .idx files before we produce even the first commit
> is a horrible thing to do.
There is also the question of memory footprint. If you have a global
index, then for each object you need to have a tuple containing SHA1 +
pack offset + reference to corresponding pack. Right now we only need
SHA1 + pack offset.
BTW I think the Newton-Raphson based index lookup approach should be
revived at some point.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 15:29 UTC (permalink / raw)
To: Dana How; +Cc: Shawn O. Pearce, Junio C Hamano, git
On Sat, 26 May 2007, Dana How wrote:
> On 5/26/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> > In pack v4 we're likely to move the SHA-1 table from the .idx file
> > into the front of the .pack file. This makes the .idx file hold
> > only the offsets and the CRC checksums of each object. If we start
> > making a super index, we have to duplicate the SHA-1 table twice
> > (once in the .pack, again in the super index).
> Hmm, hopefully the SHA-1 table can go at the _end_
> since with split packs that's the only time we know the number
> of objects in the pack... ;-)
Hmmm good point to consider.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 15:36 UTC (permalink / raw)
To: Martin Koegler; +Cc: Dana How, Shawn O. Pearce, Junio C Hamano, git
On Sun, 27 May 2007, Martin Koegler wrote:
> Dana How wrote:
> > (1c) Do not deltify naughty blobs. Naughty blobs are those
> > blobs marked "nodelta" or very large blobs.
>
> I don't like the idea to exclude any blobs from delta by default, if
> the delta could be done.
It won't happen by default.
> If the "very large blobs" are text files with
> very few differences, they deltify very well.
>
> Additionally, how do you want to define "very large blobs"?
This is indeed a per repository attribute that is highly dependent on
your data set.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Dana How @ 2007-05-27 16:06 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Shawn O. Pearce, Junio C Hamano, git, danahow
On 5/27/07, Nicolas Pitre <nico@cam.org> wrote:
> BTW I think the Newton-Raphson based index lookup approach should be
> revived at some point.
Yes.
I think if we figure out the statistics we could win big.
I thought about it a bit when it was first discussed
but need to return to it.
Thanks,
--
Dana L. How danahow@gmail.com +1 650 804 5991 cell
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Shawn O. Pearce @ 2007-05-27 21:35 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Dana How, Junio C Hamano, git
Nicolas Pitre <nico@cam.org> wrote:
> On Sat, 26 May 2007, Dana How wrote:
>
> > On 5/26/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> > > In pack v4 we're likely to move the SHA-1 table from the .idx file
> > > into the front of the .pack file. This makes the .idx file hold
> > > only the offsets and the CRC checksums of each object. If we start
> > > making a super index, we have to duplicate the SHA-1 table twice
> > > (once in the .pack, again in the super index).
> >
> > Hmm, hopefully the SHA-1 table can go at the _end_
> > since with split packs that's the only time we know the number
> > of objects in the pack... ;-)
>
> Hmmm good point to consider.
The problem with putting the SHA-1 table at the end of the pack is
it ruins the streaming for both unpack-objects and index-pack if
we were to ever use pack v4 as a transport format. Or just try
to run a pack v4 packfile through unpack-objects, just locally,
say to extract megablobs. ;-)
--
Shawn.
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Shawn O. Pearce @ 2007-05-27 21:52 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Dana How, Junio C Hamano, git
Nicolas Pitre <nico@cam.org> wrote:
> On Sat, 26 May 2007, Shawn O. Pearce wrote:
>
> > Dana How <danahow@gmail.com> wrote:
> > > Shawn: When I first saw the index-loading code, my first
> > > thought was that all the index tables should be
> > > merged (easy since sorted) so callers only need to do one search.
>
> There is also the question of memory footprint. If you have a global
> index, then for each object you need to have a tuple containing SHA1 +
> pack offset + reference to corresponding pack. Right now we only need
> SHA1 + pack offset.
I'm about half-way through a super-index implementation. Right now
the super-index is defined to be just one index file per repository
(objects/pack/super.index) with a format that looks like the
following:
header:
  uint32_t sdx_signature ('PSDX')
  uint32_t sdx_version (1)
  uint16_t sdx_packs
  uint8_t sdx_prefix_len;
  uint8_t __reserved;

pack table:
  /* SHA-1 of each pack's sorted SHA-1 object list */
  unsigned char pack_sha1[20][sdx_packs];

fan-out table:
  /* This is the standard fan-out also used in a tOc/.idx */
  uint32_t fan_out[256];

records:
  unsigned char prefix[hdr.sdx_prefix_len - 1];
  int8_t splits[hdr.sdx_packs];

trailer:
  unsigned char sha1_of_the_above[20];
I build the super-index by merging the .idx of all available
packfiles; since they are already sorted the merge is obviously
quite trivial.
The sdx_prefix_len field is initialized to something that gives a
reasonably unique object name; e.g. in git.git an sdx_prefix_len
of 3 or 4 gets pretty good at narrowing the set of objects down
very small. The idea here is that the sdx_prefix_len should be
almost the unique abbreviation length for this repository.
We store 1 less byte of the prefix in the record because the
fan-out table already accounts for the first byte of the prefix.
The splits array contains a single signed integer for each packfile;
if the integer is 0 then that packfile does not contain any object
that starts with that corresponding prefix. In such a case we
can completely avoid looking at that corresponding packfile.
With my lazy index loading change, I may not even need to open
that index. ;-)
If the splits array entry is non-zero and negative, it's the number
of times we need to halve down (towards 'lo') to get to entries
that start with this prefix and that are in that packfile's fan-out
table entry range. If it's positive, it's the number of times we have
to halve up (towards 'hi'). This way we can jump more directly to
the relevant index records and avoid redoing binary search work we
have already accomplished in the super index.
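To make that concrete, here is a hedged sketch (struct and function
names are invented, matching the layout above rather than any real git
code) of how a lookup might consult the splits array to skip packs
entirely:

#include <stdint.h>

/* Sketch only: given the super-index record for a SHA-1's prefix,
 * work out which packs can possibly contain the object.
 * splits[k] == 0 means pack k has no object with that prefix. */
struct sdx_record {
	const unsigned char *prefix;	/* sdx_prefix_len - 1 bytes */
	const int8_t *splits;		/* one signed entry per pack */
};

static unsigned candidate_packs(const struct sdx_record *rec,
				unsigned nr_packs, unsigned *out)
{
	unsigned k, n = 0;

	for (k = 0; k < nr_packs; k++) {
		if (!rec->splits[k])
			continue;	/* prefix absent: skip this pack */
		/*
		 * Non-zero: this pack must be searched.  The sign and
		 * magnitude say how many times to halve the fan-out range
		 * toward 'lo' (negative) or 'hi' (positive) before the
		 * first full hashcmp(), seeding the binary search.
		 */
		out[n++] = k;
	}
	return n;
}

A zero entry combined with the lazy .idx opening above means that
pack's index never even gets mapped.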
So we can generally build super index records at a cost of 3 or
4 bytes + sdx_packs. We can also determine which packfile(s) we
need to scan quite quickly, and we can jump further into the part
of the index avoiding a number of expensive hashcmp() calls. It
may actually be a good savings at runtime, well worth the slightly
higher memory footprint.
My testing is not yet complete, so I cannot offer any hard numbers
against any interesting/common data sets...
> BTW I think the Newton-Raphson based index lookup approach should be
> revived at some point.
That doesn't help with 10 packfiles though, does it?
--
Shawn.
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-27 23:35 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Dana How, Junio C Hamano, git
On Sun, 27 May 2007, Shawn O. Pearce wrote:
> Nicolas Pitre <nico@cam.org> wrote:
> > BTW I think the Newton-Raphson based index lookup approach should be
> > revived at some point.
>
> That doesn't help with 10 packfiles though, does it?
It helps irrespective of the number of pack files. With the current
binary search the lookup cost is O(log n). With a Newton method this
cost is almost O(1). If you have 10 pack files then you may have to do
5 separate lookups on average, but those lookups are still faster with a
Newton method.
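Just to make the idea concrete (an illustrative sketch only, not actual
git code; the flat 20-byte-stride table and helper names are
assumptions), an interpolation-style lookup estimates the next probe
from where the key sits between the current bounds instead of always
picking the midpoint:

#include <stdint.h>
#include <string.h>

/* Sketch only: treat the first 4 bytes of each SHA-1 as a uniformly
 * distributed 32-bit key. */
static uint32_t first4(const unsigned char *sha1)
{
	return ((uint32_t)sha1[0] << 24) | ((uint32_t)sha1[1] << 16) |
	       ((uint32_t)sha1[2] << 8) | (uint32_t)sha1[3];
}

/* Interpolation ("Newton-style") probe into a sorted table of nr
 * 20-byte SHA-1s.  Returns the entry index or -1 if not found. */
static int interpolation_find(const unsigned char *table, unsigned nr,
			      const unsigned char *sha1)
{
	unsigned lo = 0, hi = nr;
	uint32_t key = first4(sha1);

	while (lo < hi) {
		uint32_t lo_key = first4(table + 20 * lo);
		uint32_t hi_key = first4(table + 20 * (hi - 1));
		unsigned mi;
		int cmp;

		if (key <= lo_key)
			mi = lo;
		else if (key >= hi_key)
			mi = hi - 1;
		else
			mi = lo + (unsigned)(((uint64_t)(key - lo_key) *
					      (hi - 1 - lo)) /
					     (hi_key - lo_key));

		cmp = memcmp(table + 20 * mi, sha1, 20);
		if (!cmp)
			return mi;
		if (cmp < 0)
			lo = mi + 1;
		else
			hi = mi;
	}
	return -1;
}

Whether the fewer probes actually win in practice is exactly the open
question, since memcmp() cost and cache misses dominate for small N.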
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Dana How @ 2007-05-28 1:35 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Nicolas Pitre, Junio C Hamano, git, danahow
On 5/27/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> Nicolas Pitre <nico@cam.org> wrote:
> > On Sat, 26 May 2007, Dana How wrote:
> > > On 5/26/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> > > > In pack v4 we're likely to move the SHA-1 table from the .idx file
> > > > into the front of the .pack file. This makes the .idx file hold
> > > > only the offsets and the CRC checksums of each object. If we start
> > > > making a super index, we have to duplicate the SHA-1 table twice
> > > > (once in the .pack, again in the super index).
> > > Hmm, hopefully the SHA-1 table can go at the _end_
> > > since with split packs that's the only time we know the number
> > > of objects in the pack... ;-)
> > Hmmm good point to consider.
> The problem with putting the SHA-1 table at the end of the pack is
> it ruins the streaming for both unpack-objects and index-pack if
> we were to ever use pack v4 as a transport format. Or just try
> to run a pack v4 packfile through unpack-objects, just locally,
> say to extract megablobs. ;-)
Perhaps I have stumbled on another issue related to
including SHA-1s in packfiles. This is completely independent
of the handling of megablobs (my current focus), but the presence
of large blobs does make this issue more apparent.
Some history of what I've been doing with git:
First I simply had to import the repo,
which led to split packs (this was before index v2).
Then maintaining the repo led to the unfinished maxblobsize stuff.
Distributing the repo included users pulling (usually) from the central repo,
which would be trivial since it was also an alternate.
Local repacking would avoid heavy load on it.
Now I've started looking into how to push back into the
central repo from a user's repo (not everything will be central;
some pulling between users will occur
otherwise I wouldn't be as interested).
It looks like the entire sequence is:
A. git add file [compute SHA-1 & compress file into objects/xx]
B. git commit [write some small objects locally]
C. git push {using PROTO_LOCAL}:
1. read & uncompress objects
2. recompress objects into a pack and send through a pipe
3. read pack on other end of pipe and uncompress each object
4. compute SHA-1 for each object and compress file into objects/xx
So, after creating an object in the local working tree,
to get it into the central repo, we must:
compress -> uncompress -> compress -> uncompress -> compress.
In responsiveness this won't compare very well to Perforce,
which has only one compress step.
The sequence above could be somewhat different currently in git.
The user might have repacked their repo before pushing,
but this just moves C1 and C2 back earlier in time,
it doesn't remove the need for them. Besides, the blobs in
a push are more likely to be recent and hence unpacked.
Also, C3 and C4 might not happen if more than 100 blobs get pushed.
But this seems very unusual; only 0.3% of commits in the history
had 100+ new files/file contents. If the 100 level is reduced,
then the central repo fills up with packfiles and their index files,
reducing performance for everybody (using the central repo as an alternate).
Thus there really is 5X more compression activity going on
compared to Perforce. How can this be reduced?
One way is to restore the ability to write the "new" loose object format.
Then C1, C2, and C4 disappear. C3 must remain because we need
to uncompress the object to compute its SHA-1; we don't need
to recompress since we were already given the compressed form.
And that final sentence is why I sent this email: if the packfile
contained the SHA-1s, either at the beginning or before each object,
then they wouldn't need to be recomputed at the receiving end
and the extra decompression could be skipped as well. This would
make the total zlib effort the same as Perforce.
The fact that a loose object is never overwritten would still be retained.
Is that sufficient security? Or does the SHA-1 always need to be
recomputed on the receiving end? Could that be skipped just for
specific connections and/or protocols (presumably "trusted" ones)?
Note that none of this depends on how megablobs are handled,
but large blobs certainly do make the 5X more zlib activity more apparent.
Shawn: I think you mentioned something related to this a few days ago.
Also, you didn't like split packs putting SHA-1s at the end because
that messed up streaming for transport, but packs are not split for transport.
Thanks,
--
Dana L. How danahow@gmail.com +1 650 804 5991 cell
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-28 2:18 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Dana How, Junio C Hamano, git
On Sun, 27 May 2007, Shawn O. Pearce wrote:
> Nicolas Pitre <nico@cam.org> wrote:
> > On Sat, 26 May 2007, Dana How wrote:
> >
> > > On 5/26/07, Shawn O. Pearce <spearce@spearce.org> wrote:
> > > > In pack v4 we're likely to move the SHA-1 table from the .idx file
> > > > into the front of the .pack file. This makes the .idx file hold
> > > > only the offsets and the CRC checksums of each object. If we start
> > > > making a super index, we have to duplicate the SHA-1 table twice
> > > > (once in the .pack, again in the super index).
> > >
> > > Hmm, hopefully the SHA-1 table can go at the _end_
> > > since with split packs that's the only time we know the number
> > > of objects in the pack... ;-)
> >
> > Hmmm good point to consider.
>
> The problem with putting the SHA-1 table at the end of the pack is
> it ruins the streaming for both unpack-objects and index-pack if
> we were to ever use pack v4 as a transport format. Or just try
> to run a pack v4 packfile through unpack-objects, just locally,
> say to extract megablobs. ;-)
Right. In fact I think the SHA1 table could still remain at the
beginning even if we don't know yet that the pack will be split. It
would just happen to contain redundant entries.
In fact it would be impossible to store it at the end in the hope of
trimming it down according to the written objects because the idea is to
have commit and tree objects index into that table. So if you cull
entries from the table then indices in already written out objects won't
match.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: A Large Angry SCM @ 2007-05-28 2:30 UTC (permalink / raw)
To: Dana How; +Cc: Shawn O. Pearce, Nicolas Pitre, Junio C Hamano, git
Dana How wrote:
[...]
>
> Some history of what I've been doing with git:
> First I simply had to import the repo,
> which led to split packs (this was before index v2).
> Then maintaining the repo led to the unfinished maxblobsize stuff.
> Distributing the repo included users pulling (usually) from the central
> repo,
> which would be trivial since it was also an alternate.
> Local repacking would avoid heavy load on it.
>
> Now I've started looking into how to push back into the
> central repo from a user's repo (not everything will be central;
> some pulling between users will occur
> otherwise I wouldn't be as interested).
>
> It looks like the entire sequence is:
> A. git add file [compute SHA-1 & compress file into objects/xx]
> B. git commit [write some small objects locally]
> C. git push {using PROTO_LOCAL}:
> 1. read & uncompress objects
> 2. recompress objects into a pack and send through a pipe
> 3. read pack on other end of pipe and uncompress each object
> 4. compute SHA-1 for each object and compress file into objects/xx
>
> So, after creating an object in the local working tree,
> to get it into the central repo, we must:
> compress -> uncompress -> compress -> uncompress -> compress.
> In responsiveness this won't compare very well to Perforce,
> which has only one compress step.
>
> The sequence above could be somewhat different currently in git.
> The user might have repacked their repo before pushing,
> but this just moves C1 and C2 back earlier in time,
> it doesn't remove the need for them. Besides, the blobs in
> a push are more likely to be recent and hence unpacked.
>
> Also, C3 and C4 might not happen if more than 100 blobs get pushed.
> But this seems very unusual; only 0.3% of commits in the history
> had 100+ new files/file contents. If the 100 level is reduced,
> then the central repo fills up with packfiles and their index files,
> reducing performance for everybody (using the central repo as an
> alternate).
>
> Thus there really is 5X more compression activity going on
> compared to Perforce. How can this be reduced?
>
> One way is to restore the ability to write the "new" loose object format.
> Then C1, C2, and C4 disappear. C3 must remain because we need
> to uncompress the object to compute its SHA-1; we don't need
> to recompress since we were already given the compressed form.
>
> And that final sentence is why I sent this email: if the packfile
> contained the SHA-1s, either at the beginning or before each object,
> then they wouldn't need to be recomputed at the receiving end
> and the extra decompression could be skipped as well. This would
> make the total zlib effort the same as Perforce.
>
> The fact that a loose object is never overwritten would still be retained.
> Is that sufficient security? Or does the SHA-1 always need to be
> recomputed on the receiving end? Could that be skipped just for
> specific connections and/or protocols (presumably "trusted" ones)?
[...]
So how do you want to decide when to trust the sender and when to
validate that the objects received have the SHA-1s claimed? A _central_
repository, being authoritative, would need to _always_ validate _all_
objects it receives. And since, with a central repository setup, the
central repository is where the CPU resources are most in demand,
validating the object IDs when received at the developers' repositories
should not be a problem. And just to be fair, how does Perforce
guarantee that the retrieved version of a file matches what was checked in?
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Linus Torvalds @ 2007-05-28 16:22 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Shawn O. Pearce, Dana How, Junio C Hamano, git
On Sun, 27 May 2007, Nicolas Pitre wrote:
>
> It helps irrespective of the number of pack files. With the current
> binary search the lookup cost is O(log n). With a Newton method this
> cost is almost O(1).
This is not true.
First off, when comparing O(logn) to O(1), with "n" being less than a
billion, they are pretty much exactly the same. Think of it this way:
O(logn) == O(9) == O(1), if you know that n < 10**9.
Secondly, the cost of Newton isn't "almost O(1)". I don't know _what_ it
is (the rule of thumb with Newton-Raphson should be that the number of
significant correct digits in the answer doubles with each iteration: I
think that probably means that it should approximate O(loglog(n)), but I
haven't thought deeply about it.)
But regardless, we end up testing "a few" values. Both for the binary
search and for Newton-Raphson.
So every time you talk about O-notation, you should also consider the
constant costs, especially if the function in question is a slow-changing
one (ie when we start talking O(N**3) we can start ignoring the constants.
With O(logn), you sure as hell cannot!)
And the thing is, Newton-Raphson didn't actually speed anything up in my
tests. Sometimes it was better, sometimes it was worse, most of the time
it was in the noise.
Now, I'm sure the thing could be tweaked. Maybe I didn't do a very good
job at my initial implementation. It was a quick hack. But before somebody
makes a better one and actually shows better performance, I'd say that the
jury is still out.
Linus
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-28 17:13 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Shawn O. Pearce, Dana How, Junio C Hamano, git
On Mon, 28 May 2007, Linus Torvalds wrote:
> On Sun, 27 May 2007, Nicolas Pitre wrote:
> >
> > It helps irrespective of the number of pack files. With the current
> > binary search the lookup cost is O(log n). With a Newton method this
> > cost is almost O(1).
>
> This is not true.
>
> First off, when comparing O(logn) to O(1), with "n" being less than a
> billion, they are pretty much exactly the same. Think of it this way:
> O(logn) == O(9) == O(1), if you know that n < 10**9.
Agreed.
> Secondly, the cost of Newton isn't "almost O(1)". I don't know _what_ it
> is (the rule of thumb with Newton-Raphson should be that the number of
> significant correct digits in the answer doubles with each iteration: I
> think that probably means that it should approximate O(loglog(n)), but I
> haven't thought deeply about it.
Sure. But given the reasoning you gave above for O(log N) being about
the same as O(1) for a small enough N, I think that O(log log N) is even
closer to O(1) in that regard.
> And the thing is, Newton-Raphson didn't actually speed anything up in my
> tests. Sometimes it was better, sometimes it was worse, most of the time
> it was in the noise.
OK, I didn't remember the difference being so inconclusive.
The constant cost is certainly a big factor in the equation.
Maybe we didn't test with a big enough repo yet (big in the sense of
number of objects) to see the variable cost actually dominate.
Did you try with the KDE repo at the time? It has over 4M objects.
The binary search would require 14 iterations while the Newton search
should take around 3 or 4 iterations if the O(log log N) estimate is
right. Of course the evaluation of the next mid point is more costly
with the Newton method.
Nicolas
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Karl Hasselström @ 2007-05-28 17:40 UTC (permalink / raw)
To: Linus Torvalds
Cc: Nicolas Pitre, Shawn O. Pearce, Dana How, Junio C Hamano, git
On 2007-05-28 09:22:02 -0700, Linus Torvalds wrote:
> Secondly, the cost of Newton isn't "almost O(1)". I don't know
> _what_ it is (the rule of thumb with Newton-Raphson should be that
> the number of significant correct digits in the answer doubles with
> each iteration: I think that probably means that it should
> approximate O(loglog(n)), but I haven't thought deeply about it.
With binary search, the number of correct bits in the index increases
by 1 for each iteration. With Newton iteration, the number of correct
bits doubles for each iteration. So the number of Newton iterations
should be the log of the number of binary search iterations, i.e. log
log n like you say.
But for non-astronomical values of n, I agree that this kind of big-O
analysis is much inferior to benchmarking.
--
Karl Hasselström, kha@treskal.com
www.treskal.com/kalle
* Re: [PATCH 1/3] Lazily open pack index files on demand
From: Nicolas Pitre @ 2007-05-28 18:31 UTC (permalink / raw)
To: Dana How; +Cc: Shawn O. Pearce, Junio C Hamano, git
On Sun, 27 May 2007, Dana How wrote:
> Perhaps I have stumbled on another issue related to
> including SHA-1s in packfiles. This is completely independent
> of the handling of megablobs (my current focus), but the presence
> of large blobs do make this issue more apparent.
>
> Some history of what I've been doing with git:
> First I simply had to import the repo,
> which led to split packs (this was before index v2).
> Then maintaining the repo led to the unfinished maxblobsize stuff.
> Distributing the repo included users pulling (usually) from the central repo,
> which would be trivial since it was also an alternate.
> Local repacking would avoid heavy load on it.
>
> Now I've started looking into how to push back into the
> central repo from a user's repo (not everything will be central;
> some pulling between users will occur
> otherwise I wouldn't be as interested).
>
> It looks like the entire sequence is:
> A. git add file [compute SHA-1 & compress file into objects/xx]
> B. git commit [write some small objects locally]
> C. git push {using PROTO_LOCAL}:
> 1. read & uncompress objects
> 2. recompress objects into a pack and send through a pipe
> 3. read pack on other end of pipe and uncompress each object
> 4. compute SHA-1 for each object and compress file into objects/xx
>
> So, after creating an object in the local working tree,
> to get it into the central repo, we must:
> compress -> uncompress -> compress -> uncompress -> compress.
> In responsiveness this won't compare very well to Perforce,
> which has only one compress step.
>
> The sequence above could be somewhat different currently in git.
> The user might have repacked their repo before pushing,
> but this just moves C1 and C2 back earlier in time,
> it doesn't remove the need for them. Besides, the blobs in
> a push are more likely to be recent and hence unpacked.
>
> Also, C3 and C4 might not happen if more than 100 blobs get pushed.
> But this seems very unusual; only 0.3% of commits in the history
> had 100+ new files/file contents. If the 100 level is reduced,
> then the central repo fills up with packfiles and their index files,
> reducing performance for everybody (using the central repo as an alternate).
But it makes repacking on the central repo much easier, easily regaining
performance for everybody, even more so than if objects were left loose.
You could even use one of GIT's many hooks to asynchronously launch a
repack whenever a push has completed, to keep the number of server-side
packs low.
Also, the threshold of 100 is for all objects, not only files. So if you
do multiple commits before pushing (a workflow that GIT strongly
encourages), then 12 commits each modifying only 3 files, with each file
in a subdirectory of its own, already create 96 objects (one commit, one
root tree, three subdirectory trees and three blobs per commit). It
grows even faster if the modified files are more deeply nested.
The 100-object threshold was arbitrarily determined and no performance
tests were ever done to confirm that it is the best value.
> Thus there really is 5X more compression activity going on
> compared to Perforce. How can this be reduced?
>
> One way is to restore the ability to write the "new" loose object format.
> Then C1, C2, and C4 disappear. C3 must remain because we need
> to uncompress the object to compute its SHA-1; we don't need
> to recompress since we were already given the compressed form.
C4 isn't mandatory if you keep objects packed on the remote end as I
suggest above.
Note that C1 is unavoidable when locally repacking. It is the only way
to make sure the new pack won't blindly reuse data that might have been
corrupted on disk (think random bit flip) and wrap it with a valid SHA1
that includes the corrupted data, which is even worse. And yes, this has
happened at least once, with a report on this list. This is why we are a
bit paranoid about data integrity. And even if the server currently
recomputes object SHA1s upon reception of a pack, I think C1 during a
push is a good thing too, just so you don't bother the server if your
own repo is locally corrupted.
And yet you need C1 anyway for deltification in most normal cases.
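(For illustration, a rough sketch of what "recompute object SHA1" means
here -- verify_object_sha1 is a hypothetical helper, not git's actual
code, and it uses OpenSSL's SHA-1 routines directly.  The object name
is the SHA-1 of a "<type> <size>\0" header followed by the inflated
object contents:)

#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

/*
 * Recompute an object's name from its type and inflated contents and
 * compare it against the name we were given.  Returns 0 on a match.
 */
static int verify_object_sha1(const char *type, const void *buf,
                              unsigned long len,
                              const unsigned char expected[20])
{
        unsigned char actual[20];
        char hdr[32];
        int hdrlen;
        SHA_CTX ctx;

        /* the header is "<type> <decimal length>" plus its NUL byte */
        hdrlen = snprintf(hdr, sizeof(hdr), "%s %lu", type, len) + 1;

        SHA1_Init(&ctx);
        SHA1_Update(&ctx, hdr, hdrlen);
        SHA1_Update(&ctx, buf, len);
        SHA1_Final(actual, &ctx);

        return memcmp(actual, expected, 20) ? -1 : 0;
}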
As for C2, you should see it as helping the server repack objects. If
C2 is distributed amongst all clients (including C1 and deltification), it
is then much less visible and the server doesn't need to do that part
when repacking.
> And that final sentence is why I sent this email: if the packfile
> contained the SHA-1s, either at the beginning or before each object,
> then they wouldn't need to be recomputed at the receiving end
> and the extra decompression could be skipped as well. This would
> make the total zlib effort the same as Perforce.
This cannot be done. GIT, being distributed, means that you should not
trust the remote end. The notion of validation and trust comes from the
fact that GIT actively validates the SHA1 of what it is fed, so a rogue
server (or even client) cannot just substitute an object for another
(with or without malice) and pretend everything is fine.
> The fact that a loose object is never overwritten would still be retained.
> Is that sufficient security? Or does the SHA-1 always need to be
> recomputed on the receiving end? Could that be skipped just for
> specific connections and/or protocols (presumably "trusted" ones)?
Even trusted connections can sometimes transmit bad bits. Corruption
_will_ happen someday to someone. It has already happened to people on
this list. Hardware is not immune to faults, and the bigger your repo
is, the more likely you are to be hit by such things. If the SHA1 is
validated on each hop, corruption cannot propagate.
> Shawn: I think you mentioned something related to this a few days ago.
> Also, you didn't like split packs putting SHA-1s at the end because
> that messed up streaming for transport, but packs are not split for transport.
As I stated elsewhere, the SHA1 table gains nothing from being at the
end anyway, since it cannot be trimmed down even for split packs.
Nicolas
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 1/3] Lazily open pack index files on demand
@ 2007-05-29 0:09 linux
2007-05-29 3:26 ` Linus Torvalds
0 siblings, 1 reply; 26+ messages in thread
From: linux @ 2007-05-29 0:09 UTC (permalink / raw)
To: git, torvalds
> First off, when comparing O(logn) to O(1), with "n" being less than a
> billion, they are pretty much exactly the same. Think of it this way:
> O(logn) == O(9) == O(1), if you know that n < 10**9.
Well, binary searches mean binary logarithms, so O(log n) = O(30).
Still, pretty low.
> Secondly, the cost of Newton isn't "almost O(1)". I don't know _what_ it
> is (the rule of thumb with Newton-Raphson should be that the number of
> significant correct digits in the answer doubles with each iteration: I
> think that probably means that it should approximate O(loglog(n)), but I
> haven't thought deeply about it.
Excellent intuition! The algorithm is most commonly known in the computer
science literature as "interpolation search" and it does indeed take O(log
log n) time for uniformly distributed data, which is a good assumption
for SHA-1 hashes.
Of course, for n = 10**9, log(n) is 30 and log log n is 5.
More to the point, for n = 10**6, log(n) is 20 and log(log(n)) is still 5.
Even losing a constant factor of 2, it still seems like it might offer a
factor-of-2 speedup for large repositories.
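(To make the idea concrete, a rough sketch of interpolation search over
a sorted table of 20-byte SHA-1s -- first32 and interpolation_search
are hypothetical helpers, not the code under discussion.  It uses the
first four bytes of each key to estimate where the wanted entry should
fall in the remaining range:)

#include <stdint.h>
#include <string.h>

/* first four bytes of a SHA-1 as a big-endian 32-bit value */
static uint32_t first32(const unsigned char *sha1)
{
        return ((uint32_t)sha1[0] << 24) | (sha1[1] << 16) |
               (sha1[2] << 8) | sha1[3];
}

/* table[] is sorted; returns the index of want, or -1 if absent */
static int interpolation_search(const unsigned char (*table)[20],
                                uint32_t nr, const unsigned char *want)
{
        uint32_t lo = 0, hi = nr;       /* candidate range is [lo, hi) */
        uint32_t key = first32(want);

        while (lo < hi) {
                uint32_t lo_key = first32(table[lo]);
                uint32_t hi_key = first32(table[hi - 1]);
                uint32_t mi;
                int cmp;

                if (key <= lo_key)
                        mi = lo;
                else if (key >= hi_key)
                        mi = hi - 1;
                else    /* estimate the position from the key values */
                        mi = lo + (uint32_t)(((uint64_t)(key - lo_key) *
                                (hi - 1 - lo)) / (hi_key - lo_key));

                cmp = memcmp(table[mi], want, 20);
                if (!cmp)
                        return (int)mi;
                if (cmp < 0)
                        lo = mi + 1;
                else
                        hi = mi;
        }
        return -1;
}

(Once the remaining range shares the same first four bytes, this sketch
degenerates to stepping one entry at a time, so a real implementation
would interpolate on more bits or fall back to binary search for small
ranges.)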
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH 1/3] Lazily open pack index files on demand
2007-05-29 0:09 linux
@ 2007-05-29 3:26 ` Linus Torvalds
0 siblings, 0 replies; 26+ messages in thread
From: Linus Torvalds @ 2007-05-29 3:26 UTC (permalink / raw)
To: linux; +Cc: git
On Mon, 28 May 2007, linux@horizon.com wrote:
>
> Even losing a constant factor of 2, it still seems like it might offer a
> factor-of-2 speedup for large repositories.
Well, not so much according to the numbers I had.
Yes, SHA-1s are very nicely distributed on a larger scale, but on a
_small_ scale they aren't.
So you end up being able to get very good initial guesses for the first
few iterations, but once you get "close enough", you can't do anything at
all, and then you're actually worse off.
Also, please do realize that the binary search is actually a *smart*
binary search, which does a radix-256 fan-out at the beginning, getting
rid of the first 8 levels of searching for free.
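(For readers unfamiliar with the index layout, the fan-out works roughly
as below -- a simplified sketch, not the exact sha1_file.c code; the
real .idx stores the 256 cumulative counts in network byte order and
interleaves the offsets with the SHA-1s:)

#include <stdint.h>
#include <string.h>

/*
 * fanout[b] counts the objects whose first SHA-1 byte is <= b, so the
 * objects starting with byte b occupy [fanout[b-1], fanout[b]) in the
 * sorted table and the binary search can start there instead of [0, nr).
 */
static int find_in_idx(const uint32_t fanout[256],
                       const unsigned char (*sha1s)[20],
                       const unsigned char *want)
{
        uint32_t lo = want[0] ? fanout[want[0] - 1] : 0;
        uint32_t hi = fanout[want[0]];

        while (lo < hi) {
                uint32_t mi = lo + (hi - lo) / 2;
                int cmp = memcmp(sha1s[mi], want, 20);

                if (!cmp)
                        return (int)mi;
                if (cmp < 0)
                        lo = mi + 1;
                else
                        hi = mi;
        }
        return -1;      /* not in this pack */
}

(With roughly uniform keys each of the 256 buckets holds about nr/256
entries, which is why the fan-out is worth about eight halving steps.)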
The Newton-Raphson thing can do that too (and in my trial patch, did), but
it means that you actually get rid of just the initial guess (the one that
worked the best!) and you still have the problem that once you get close
enough, the distribution at a local level is not at all nice and linear.
So what I did with my patch (you can find it in the archives - search for
Newton-Raphson, I'd say about 3 months ago) was to do three cycles of
approximating, and then a linear search. The linear search has good cache
behaviour, so it's not as horrid as it might sound, but for some numbers I
did, my approximate thing would hit on the exact first entry about half
the time, but would have to walk up to ~20 entries occasionally.
Which meant that the binary search (with the initial radix-256 fan-out)
actually mostly outperformed the Newton-Raphson thing.
Also, object lookup is certainly noticeable, but the biggest cost of it is
the cache misses, and even then it's not really _that_ noticeable. And
it's really neither slow nor stupid as it is.
So I'd love to see somebody try to do a _proper_ approximation of Newton-
Raphson, not the quick hack I did. But I think factor-of-two is actually
optimistic, even for pretty large repos. And it would have to be no worse
than what we have now for average-sized ones.
(And object lookup time is generally not the dominant cost, so while it's
noticeable, cutting it down by even a whole 50% isn't going to speed up
any normal git operations by more than a couple of percent. Generating
diffs in particular is a much more costly op for things like "git blame")
Linus
^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2007-05-29 3:26 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-26 5:24 [PATCH 1/3] Lazily open pack index files on demand Shawn O. Pearce
2007-05-26 8:29 ` Junio C Hamano
2007-05-26 17:30 ` Shawn O. Pearce
2007-05-26 17:31 ` Dana How
2007-05-27 2:43 ` Nicolas Pitre
2007-05-27 4:31 ` Dana How
2007-05-27 14:41 ` Nicolas Pitre
2007-05-27 3:34 ` Shawn O. Pearce
2007-05-27 4:40 ` Dana How
2007-05-27 15:29 ` Nicolas Pitre
2007-05-27 21:35 ` Shawn O. Pearce
2007-05-28 1:35 ` Dana How
2007-05-28 2:30 ` A Large Angry SCM
2007-05-28 18:31 ` Nicolas Pitre
2007-05-28 2:18 ` Nicolas Pitre
2007-05-27 15:26 ` Nicolas Pitre
2007-05-27 16:06 ` Dana How
2007-05-27 21:52 ` Shawn O. Pearce
2007-05-27 23:35 ` Nicolas Pitre
2007-05-28 16:22 ` Linus Torvalds
2007-05-28 17:13 ` Nicolas Pitre
2007-05-28 17:40 ` Karl Hasselström
-- strict thread matches above, loose matches on Subject: below --
2007-05-27 10:46 Martin Koegler
2007-05-27 15:36 ` Nicolas Pitre
2007-05-29 0:09 linux
2007-05-29 3:26 ` Linus Torvalds