git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 3/3] git-fetch: avoid local fetching from alternate (again)
@ 2007-11-08  8:00 Shawn O. Pearce
  2007-11-08 10:00 ` Shawn O. Pearce
  0 siblings, 1 reply; 5+ messages in thread
From: Shawn O. Pearce @ 2007-11-08  8:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Back in e3c6f240fd9c5bdeb33f2d47adc859f37935e2df Junio taught
git-fetch to avoid copying objects when we are fetching from
a repository that is already registered as an alternate object
database.  In such a case there is no reason to copy any objects
as we can already obtain them through the alternate.

However we need to ensure the objects are all reachable, so we
run `git rev-list --objects $theirs --not --all` to verify this.
If any object is missing or unreadable then we need to fetch/copy
the objects from the remote.  When a missing object is detected
the git-rev-list process will exit with a non-zero exit status,
making this condition quite easy to detect.

Although git-fetch is currently a builtin (and so is rev-list)
we cannot invoke the traverse_objects() API at this point in the
transport code.  The object walker within traverse_objects() calls
die() as soon as it finds an object it cannot read.  If that happens
we want to resume the fetch process by calling do_fetch_pack().
To get aroaund this we spawn git-rev-list into a background process
to prevent a die() from killing the foreground fetch process,
thus allowing the fetch process to resume into do_fetch_pack()
if copying is necessary.

We aren't interested in the output of rev-list (a list of SHA-1
object names that are reachable) or its errors (a "spurious" error
about an object not being found as we need to copy it) so we redirect
both stdout and stderr to /dev/null.

We run this git-rev-list based check before any fetch as we may
already have the necessary objects local from a prior fetch.  If we
don't then its very likely the first $theirs object listed on the
command line won't exist locally and git-rev-list will die very
quickly, allowing us to start the network transfer.  This test even
on remote URLs may save bandwidth if someone runs `git pull origin`,
sees a merge conflict, resets out, then redoes the same pull just
a short time later.  If the remote hasn't changed between the two
pulls and the local repository hasn't had git-gc run in it then
there is probably no need to perform network transfer as all of
the objects are local.

Documentation for the new fetch_local_nocopy function was suggested
and written by Junio, based on his original comment in git-fetch.sh.

Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 transport.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/transport.c b/transport.c
index f4577b7..d37864e 100644
--- a/transport.c
+++ b/transport.c
@@ -615,17 +615,76 @@ static struct ref *get_refs_via_connect(struct transport *transport)
 	return refs;
 }
 
+/*
+ * We would want to bypass the object transfer altogether if
+ * everything we are going to fetch already exists and connected
+ * locally.
+ *
+ * The refs we are going to fetch are in to_fetch (nr_heads in
+ * total).  If running
+ *
+ *  $ git-rev-list --objects to_fetch[0] to_fetch[1] ... --not --all
+ *
+ * does not error out, that means everything reachable from the
+ * refs we are going to fetch exists and is connected to some of
+ * our existing refs.
+ */
+static int fetch_local_nocopy(struct transport *transport,
+			       int nr_heads, struct ref **to_fetch)
+{
+	struct git_transport_data *data = transport->data;
+	struct child_process revlist;
+	char **argv;
+	int i, j, err;
+
+	/*
+	 * If we are deepening a shallow clone we already have these
+	 * objects reachable.  Running rev-list here will return with
+	 * a good (0) exit status and we'll bypass the fetch that we
+	 * really need to perform.  Claiming failure now will ensure
+	 * we perform the network exchange to deepen our history.
+	 */
+	if (data->depth)
+		return -1;
+
+	i = 0;
+	argv = xmalloc(sizeof(*argv) * (nr_heads + 6));
+	argv[i++] = xstrdup("rev-list");
+	argv[i++] = xstrdup("--no-output");
+	argv[i++] = xstrdup("--objects");
+	for (j = 0; j < nr_heads; j++)
+		argv[i++] = xstrdup(sha1_to_hex(to_fetch[j]->old_sha1));
+	argv[i++] = xstrdup("--not");
+	argv[i++] = xstrdup("--all");
+	argv[i++] = NULL;
+
+	memset(&revlist, 0, sizeof(revlist));
+	revlist.argv = (const char**)argv;
+	revlist.git_cmd = 1;
+	revlist.no_stdin = 1;
+	revlist.no_stdout = 1;
+	revlist.no_stderr = 1;
+	err = run_command(&revlist);
+
+	for (i = 0; argv[i]; i++)
+		free(argv[i]);
+	free(argv);
+	return err;
+}
+
 static int fetch_refs_via_pack(struct transport *transport,
 			       int nr_heads, struct ref **to_fetch)
 {
 	struct git_transport_data *data = transport->data;
-	char **heads = xmalloc(nr_heads * sizeof(*heads));
-	char **origh = xmalloc(nr_heads * sizeof(*origh));
+	char **heads, **origh;
 	struct ref *refs;
-	char *dest = xstrdup(transport->url);
+	char *dest;
 	struct fetch_pack_args args;
 	int i;
 
+	if (!fetch_local_nocopy(transport, nr_heads, to_fetch))
+		return 0;
+
 	memset(&args, 0, sizeof(args));
 	args.uploadpack = data->uploadpack;
 	args.keep_pack = data->keep;
@@ -634,6 +693,9 @@ static int fetch_refs_via_pack(struct transport *transport,
 	args.verbose = transport->verbose > 0;
 	args.depth = data->depth;
 
+	heads = xmalloc(nr_heads * sizeof(*heads));
+	origh = xmalloc(nr_heads * sizeof(*origh));
+	dest = xstrdup(transport->url);
 	for (i = 0; i < nr_heads; i++)
 		origh[i] = heads[i] = xstrdup(to_fetch[i]->name);
 	refs = fetch_pack(&args, dest, nr_heads, heads, &transport->pack_lockfile);
-- 
1.5.3.5.1590.gfadfad

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] git-fetch: avoid local fetching from alternate (again)
  2007-11-08  8:00 [PATCH 3/3] git-fetch: avoid local fetching from alternate (again) Shawn O. Pearce
@ 2007-11-08 10:00 ` Shawn O. Pearce
  2007-11-08 10:07   ` Junio C Hamano
  0 siblings, 1 reply; 5+ messages in thread
From: Shawn O. Pearce @ 2007-11-08 10:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

"Shawn O. Pearce" <spearce@spearce.org> wrote:
> Back in e3c6f240fd9c5bdeb33f2d47adc859f37935e2df Junio taught
> git-fetch to avoid copying objects when we are fetching from
> a repository that is already registered as an alternate object
> database.  In such a case there is no reason to copy any objects
> as we can already obtain them through the alternate.

I haven't figured it out yet but this patch seriously breaks
t5515-fetch-merge-logic.  For some reason the tag tag-three-file is
not being included in .git/FETCH_HEAD as a not-for-merge ref, but all
of the test vectors are expecting it to be present.  Prior to this
patch it was included and I don't think the test vectors are wrong.

If I run git-fetch from outside the test library it does the right
thing and fetches this annotated tag pointing to a blob just fine.
But during the test vector it never even mentions that tag as part
of the status output, nor does it include it into .git/FETCH_HEAD.
Its almost like the tag ain't there.

I'm starting to suspect heap corruption again in builtin-fetch.
This patch alters the malloc() calls we are doing and may be shifting
something around just enough in memory to cause a data overwrite or
something and that's why this tag just drops out of the linked list?
But then why does that happen in the test suite but not outside.
Maybe because the test suite is setting environment variables that
I'm not and the impact of those combined with these additional
mallocs is what is breaking it?  *sigh*

-- 
Shawn.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] git-fetch: avoid local fetching from alternate (again)
  2007-11-08 10:00 ` Shawn O. Pearce
@ 2007-11-08 10:07   ` Junio C Hamano
  2007-11-08 11:22     ` Shawn O. Pearce
  0 siblings, 1 reply; 5+ messages in thread
From: Junio C Hamano @ 2007-11-08 10:07 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

"Shawn O. Pearce" <spearce@spearce.org> writes:

> I'm starting to suspect heap corruption again in builtin-fetch.
> This patch alters the malloc() calls we are doing and may be shifting
> something around just enough in memory to cause a data overwrite or
> something and that's why this tag just drops out of the linked list?
> But then why does that happen in the test suite but not outside.
> Maybe because the test suite is setting environment variables that
> I'm not and the impact of those combined with these additional
> mallocs is what is breaking it?  *sigh*

Thanks for working hard on this one.

It is starting to look like today was "let's kill other people's
bugs" day.  I'd go to bed before I completely miss sleep, which
I often end up doing when thinking too long about bugs.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] git-fetch: avoid local fetching from alternate (again)
  2007-11-08 10:07   ` Junio C Hamano
@ 2007-11-08 11:22     ` Shawn O. Pearce
  2007-11-08 22:27       ` Shawn O. Pearce
  0 siblings, 1 reply; 5+ messages in thread
From: Shawn O. Pearce @ 2007-11-08 11:22 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <gitster@pobox.com> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
> 
> > I'm starting to suspect heap corruption again in builtin-fetch.
> > This patch alters the malloc() calls we are doing and may be shifting
> > something around just enough in memory to cause a data overwrite or
> > something and that's why this tag just drops out of the linked list?
> > But then why does that happen in the test suite but not outside.
> > Maybe because the test suite is setting environment variables that
> > I'm not and the impact of those combined with these additional
> > mallocs is what is breaking it?  *sigh*

Found it.  This ain't pretty.  Remember, what's failing in the test
suite is we aren't getting "tag-three-file" automatically followed
during fetch.  This is an annotated tag referring to a blob.

Here's the *WAY WRONG FIX THAT SHOULD NOT EVER BE APPLIED*:

diff --git a/builtin-fetch.c b/builtin-fetch.c
index 847db73..a935b5a 100644
--- a/builtin-fetch.c
+++ b/builtin-fetch.c
@@ -389,7 +389,7 @@ static struct ref *find_non_local_tags(struct transport *transport,
 
 		if (!path_list_has_path(&existing_refs, ref_name) &&
 		    !path_list_has_path(&new_refs, ref_name) &&
-		    lookup_object(ref->old_sha1)) {
+		    has_sha1_file(ref->old_sha1)) {
 			path_list_insert(ref_name, &new_refs);
 
 			rm = alloc_ref(strlen(ref_name) + 1);

(I know Junio knows why this patch shouldn't be applied.
 Its exactly why quickfetch calls rev-list --objects...)

The problem here is my quickfetch patch is bypassing the internal
call to fetch-pack.  Which means we never do object handshaking,
and thus never allocate struct object* for the things we had been
talking about (because we never talked about them!).  Or in the
case of commits, their trees and parents too.  Thus the call above
to lookup_object() for a blob fails, as we never tried to parse it
before in fetch-pack.

Now I'm really not sure why we have blob objects allocated into
memory for lookup_object() during fetch-pack, but apparently we do.
At least during this test vector.  It doesn't make sense if we are
only talking about commits (and their parents).  I'd expect the
root trees to have objects (when the commit buffer was parsed for
graph traversal) but not blobs.

Maybe some smart individual will come up with the right solution to
keep automatic tag following enabled (safely!) while also allowing
us to use quickfetch.  There's got to be a solution that doesn't
implicitly rely upon the handshaking we just happened to do with
the remote peer...

Me, I'm off to catch a few hours of sleep.

-- 
Shawn.

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] git-fetch: avoid local fetching from alternate (again)
  2007-11-08 11:22     ` Shawn O. Pearce
@ 2007-11-08 22:27       ` Shawn O. Pearce
  0 siblings, 0 replies; 5+ messages in thread
From: Shawn O. Pearce @ 2007-11-08 22:27 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

"Shawn O. Pearce" <spearce@spearce.org> wrote:
> Junio C Hamano <gitster@pobox.com> wrote:
> > "Shawn O. Pearce" <spearce@spearce.org> writes:
> > 
> > > I'm starting to suspect heap corruption again in builtin-fetch.
> > > This patch alters the malloc() calls we are doing and may be shifting
> > > something around just enough in memory to cause a data overwrite or
> > > something and that's why this tag just drops out of the linked list?
> > > But then why does that happen in the test suite but not outside.
> > > Maybe because the test suite is setting environment variables that
> > > I'm not and the impact of those combined with these additional
> > > mallocs is what is breaking it?  *sigh*
> 
> Found it.  This ain't pretty.  Remember, what's failing in the test
> suite is we aren't getting "tag-three-file" automatically followed
> during fetch.  This is an annotated tag referring to a blob.

Here's one possible way of fixing this.  I think what we want is to
include "--not --all" when we run the object listing immediately
after the fetch so we mark *only* those objects we just fetched
and ignore the ones we already had reachable through existing refs.
This makes the walking cost proportional to the size of the fetch
and not the size of the object database.

Unfortunately when you insert "--not" in front of "--all" in the args
array below the test vectors all fail again.  Apparently we already
have the blob that "tag-three-file" refers to so its not something
we walk over during this process and the tag isn't followed.
That happens because the test repository we are fetching into
already has a number of objects through its refs/remotes/origin/*
namespace created by git-clone and the blob is already considered
to be reachable.

I'm not formatting this as a real patch because I'm not yet sure this
is the right way to fix the automatic tag following code.  Or the
right way to abuse the revision walking machinary within git-fetch
to implement such a feature.  Comments would be most appreciated.  :)


--8>--
diff --git a/builtin-fetch.c b/builtin-fetch.c
index 847db73..77f1901 100644
--- a/builtin-fetch.c
+++ b/builtin-fetch.c
@@ -8,6 +8,9 @@
 #include "path-list.h"
 #include "remote.h"
 #include "transport.h"
+#include "diff.h"
+#include "list-objects.h"
+#include "revision.h"
 
 static const char fetch_usage[] = "git-fetch [-a | --append] [--upload-pack <upload-pack>] [-f | --force] [--no-tags] [-t | --tags] [-k | --keep] [-u | --update-head-ok] [--depth <depth>] [-v | --verbose] [<repository> <refspec>...]";
 
@@ -335,11 +338,43 @@ static void store_updated_refs(const char *url, struct ref *ref_map)
 	fclose(fp);
 }
 
+static void mark_just_fetched_commit(struct commit *commit)
+{
+}
+
+static void mark_just_fetched_object(struct object_array_entry *p)
+{
+	if (p->item->type == OBJ_BLOB && !has_sha1_file(p->item->sha1))
+		die("missing blob object '%s'", sha1_to_hex(p->item->sha1));
+}
+
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
 {
 	int ret = transport_fetch_refs(transport, ref_map);
-	if (!ret)
+	if (!ret) {
+		const char *args[] = { "rev-list", "--objects", "--all", NULL};
+		struct rev_info revs;
+		struct ref *ref;
+
+		init_revisions(&revs, NULL);
+		setup_revisions(ARRAY_SIZE(args) - 1, args, &revs, NULL);
+		save_commit_buffer = 0;
+		track_object_refs = 0;
+
+		for (ref = ref_map; ref; ref = ref->next) {
+			struct object *o = parse_object(ref->old_sha1);
+			if (o)
+				add_pending_object(&revs, o, "(just-fetched)");
+		}
+
+		prepare_revision_walk(&revs);
+		mark_edges_uninteresting(revs.commits, &revs, NULL);
+		traverse_commit_list(&revs,
+			mark_just_fetched_commit,
+			mark_just_fetched_object);
+
 		store_updated_refs(transport->url, ref_map);
+	}
 	transport_unlock_pack(transport);
 	return ret;
 }
@@ -365,6 +400,7 @@ static struct ref *find_non_local_tags(struct transport *transport,
 	struct ref *ref_map = NULL;
 	struct ref **tail = &ref_map;
 	const struct ref *ref;
+	struct object *obj;
 
 	for_each_ref(add_existing, &existing_refs);
 	for (ref = transport_get_remote_refs(transport); ref; ref = ref->next) {
@@ -389,7 +425,8 @@ static struct ref *find_non_local_tags(struct transport *transport,
 
 		if (!path_list_has_path(&existing_refs, ref_name) &&
 		    !path_list_has_path(&new_refs, ref_name) &&
-		    lookup_object(ref->old_sha1)) {
+		    (obj = lookup_object(ref->old_sha1)) &&
+		    obj->flags & SEEN) {
 			path_list_insert(ref_name, &new_refs);
 
 			rm = alloc_ref(strlen(ref_name) + 1);
diff --git a/list-objects.c b/list-objects.c
index e5c88c2..78bf04c 100644
--- a/list-objects.c
+++ b/list-objects.c
@@ -170,4 +170,5 @@ void traverse_commit_list(struct rev_info *revs,
 	}
 	for (i = 0; i < objects.nr; i++)
 		show_object(&objects.objects[i]);
+	free(objects.objects);
 }

-- 
Shawn.

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2007-11-08 22:27 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-08  8:00 [PATCH 3/3] git-fetch: avoid local fetching from alternate (again) Shawn O. Pearce
2007-11-08 10:00 ` Shawn O. Pearce
2007-11-08 10:07   ` Junio C Hamano
2007-11-08 11:22     ` Shawn O. Pearce
2007-11-08 22:27       ` Shawn O. Pearce

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).