git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
@ 2009-01-07  7:43 Christian Couder
  2009-01-07  8:41 ` Junio C Hamano
  2009-01-07 12:29 ` Johannes Schindelin
  0 siblings, 2 replies; 7+ messages in thread
From: Christian Couder @ 2009-01-07  7:43 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

The code of this mechanism has been copied from the commit graft code.

Currently this mechanism is only used from the "parse_commit_buffer"
function in "commit.c". It should probably be used from "fsck.c" too.

(For information, grafts are looked up only from "parse_commit_buffer"
function in "commit.c" and from "fsck_commit" in "fsck.c".)

In "parse_commit_buffer", the parent sha1s from the original commit
or from a commit graft that match a ref name in "refs/replace/" are
replaced by the commit sha1 that has been read in the ref.

This means that for example "git show <original commit sha1>" will
display information about the original commit. If the mechanism
had been called from "read_sha1_file" instead of when parents
are read, then "git show <original commit sha1>" would display
information about the commit that replaces the original one.
This may be seen as a feature or as a bug depending on the point
of view.

Anyway this implementation makes sure that the mechanism is
triggered only when commit graft could be triggered, so hopefully the
object reachability traverser will ignore this mechanism as it
ignores the graft mechanism.

Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
---
 Makefile         |    1 +
 commit.c         |    7 +++-
 commit.h         |    2 +
 replace_object.c |  102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 replace_object.c

diff --git a/Makefile b/Makefile
index aabf013..f355e63 100644
--- a/Makefile
+++ b/Makefile
@@ -471,6 +471,7 @@ LIB_OBJS += read-cache.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
+LIB_OBJS += replace_object.o
 LIB_OBJS += rerere.o
 LIB_OBJS += revision.o
 LIB_OBJS += run-command.o
diff --git a/commit.c b/commit.c
index c99db16..0014174 100644
--- a/commit.c
+++ b/commit.c
@@ -241,6 +241,7 @@ int parse_commit_buffer(struct commit *item, void *buffer, unsigned long size)
 	char *tail = buffer;
 	char *bufptr = buffer;
 	unsigned char parent[20];
+	const unsigned char *parent_sha1;
 	struct commit_list **pptr;
 	struct commit_graft *graft;
 
@@ -268,7 +269,8 @@ int parse_commit_buffer(struct commit *item, void *buffer, unsigned long size)
 		bufptr += 48;
 		if (graft)
 			continue;
-		new_parent = lookup_commit(parent);
+		parent_sha1 = lookup_replace_object(parent);
+		new_parent = lookup_commit(parent_sha1);
 		if (new_parent)
 			pptr = &commit_list_insert(new_parent, pptr)->next;
 	}
@@ -276,7 +278,8 @@ int parse_commit_buffer(struct commit *item, void *buffer, unsigned long size)
 		int i;
 		struct commit *new_parent;
 		for (i = 0; i < graft->nr_parent; i++) {
-			new_parent = lookup_commit(graft->parent[i]);
+			parent_sha1 = lookup_replace_object(graft->parent[i]);
+			new_parent = lookup_commit(parent_sha1);
 			if (!new_parent)
 				continue;
 			pptr = &commit_list_insert(new_parent, pptr)->next;
diff --git a/commit.h b/commit.h
index 3a7b06a..37aa763 100644
--- a/commit.h
+++ b/commit.h
@@ -122,6 +122,8 @@ struct commit_graft *read_graft_line(char *buf, int len);
 int register_commit_graft(struct commit_graft *, int);
 struct commit_graft *lookup_commit_graft(const unsigned char *sha1);
 
+const unsigned char *lookup_replace_object(const unsigned char *sha1);
+
 extern struct commit_list *get_merge_bases(struct commit *rev1, struct commit *rev2, int cleanup);
 extern struct commit_list *get_merge_bases_many(struct commit *one, int n, struct commit **twos, int cleanup);
 extern struct commit_list *get_octopus_merge_bases(struct commit_list *in);
diff --git a/replace_object.c b/replace_object.c
new file mode 100644
index 0000000..b50890d
--- /dev/null
+++ b/replace_object.c
@@ -0,0 +1,102 @@
+#include "cache.h"
+#include "refs.h"
+
+static struct replace_object {
+	unsigned char sha1[2][20];
+} **replace_object;
+
+static int replace_object_alloc, replace_object_nr;
+
+static int replace_object_pos(const unsigned char *sha1)
+{
+	int lo, hi;
+	lo = 0;
+	hi = replace_object_nr;
+	while (lo < hi) {
+		int mi = (lo + hi) / 2;
+		struct replace_object *rep = replace_object[mi];
+		int cmp = hashcmp(sha1, rep->sha1[0]);
+		if (!cmp)
+			return mi;
+		if (cmp < 0)
+			hi = mi;
+		else
+			lo = mi + 1;
+	}
+	return -lo - 1;
+}
+
+static int register_replace_object(struct replace_object *replace,
+				   int ignore_dups)
+{
+	int pos = replace_object_pos(replace->sha1[0]);
+
+	if (0 <= pos) {
+		if (ignore_dups)
+			free(replace);
+		else {
+			free(replace_object[pos]);
+			replace_object[pos] = replace;
+		}
+		return 1;
+	}
+	pos = -pos - 1;
+	if (replace_object_alloc <= ++replace_object_nr) {
+		replace_object_alloc = alloc_nr(replace_object_alloc);
+		replace_object = xrealloc(replace_object,
+					  sizeof(*replace_object) *
+					  replace_object_alloc);
+	}
+	if (pos < replace_object_nr)
+		memmove(replace_object + pos + 1,
+			replace_object + pos,
+			(replace_object_nr - pos - 1) *
+			sizeof(*replace_object));
+	replace_object[pos] = replace;
+	return 0;
+}
+
+static int register_replace_ref(const char *refname,
+				const unsigned char *sha1,
+				int flag, void *cb_data)
+{
+	/* Get sha1 from refname */
+	const char *slash = strrchr(refname, '/');
+	const char *hash = slash ? slash + 1 : refname;
+	struct replace_object * repl_obj = xmalloc(sizeof(*repl_obj));
+
+	if (strlen(hash) != 40 || get_sha1_hex(hash, repl_obj->sha1[0])) {
+		free(repl_obj);
+		warning("bad replace ref name: %s", refname);
+	}
+
+	/* Copy sha1 from the read ref */
+	hashcpy(repl_obj->sha1[1], sha1);
+
+	/* Register new object */
+	if (register_replace_object(repl_obj, 1))
+		warning("duplicate replace ref: %s", refname);
+
+	return 0;
+}
+
+static void prepare_replace_object(void)
+{
+	static int replace_object_prepared;
+
+	if (replace_object_prepared)
+		return;
+
+	for_each_replace_ref(register_replace_ref, NULL);
+	replace_object_prepared = 1;
+}
+
+const unsigned char *lookup_replace_object(const unsigned char *sha1)
+{
+	int pos;
+
+	prepare_replace_object();
+	pos = replace_object_pos(sha1);
+
+	return (0 <= pos) ? replace_object[pos]->sha1[1] : sha1;
+}
-- 
1.6.1.162.g1cd53

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
  2009-01-07  7:43 [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/" Christian Couder
@ 2009-01-07  8:41 ` Junio C Hamano
  2009-01-08 17:31   ` Christian Couder
  2009-01-07 12:29 ` Johannes Schindelin
  1 sibling, 1 reply; 7+ messages in thread
From: Junio C Hamano @ 2009-01-07  8:41 UTC (permalink / raw)
  To: Christian Couder; +Cc: git

Christian Couder <chriscool@tuxfamily.org> writes:

> In "parse_commit_buffer", the parent sha1s from the original commit
> or from a commit graft that match a ref name in "refs/replace/" are
> replaced by the commit sha1 that has been read in the ref.

I may be reading this wrong, but if you replace the parent of the commit,
that won't improve anything over the graft mechanism, which we already
have.

What I was hoping to see was to give replacing commit back when a commit
is asked for by normal callers, while not replacing the objects when
reachability traversal (prune, pack transfer and fsck) wants to read the
commit.  That way:

 (1) Normal callers will see the rewritten history (which is the same as
     graft); and

 (2) We will kill the design bug of the current graft mechanism that graft
     information cannot be transferred to the other end.  By using object
     replacement, you can fetch objects reachable from refs/replace/ at
     the other end and place them in the same refs/replace/ hierarchy
     locally, if you choose to use the same altered history, or you can
     choose not to use the replacement yourself.  The resulting repository
     is fully connected either way.

 (3) We will also kill the design bug of the current graft mechanism that
     graft information hides repository corruption to fsck.  The problem
     with this is that if you and then remove the grafts, you will end up
     with a corrupt repository, because these operations do look at grafts
     (this is justified only partly because otherwise you will lose
     objects that are reachable only via grafts, but is broken at the same
     time because you can lose the true parents by letting graft hide them
     from a reachable commit).

     By doing fsck and prune always using the true reachability, and using
     refs in the refs/replace/ hierarchy for protecting objects that are
     involved in this new way of grafting, you will ensure the integrity
     of the repository.  Removal of a ref from the refs/replace/ hierarchy
     won't result in such a corruption we can have today.

And that is exactly the reason why I was hoping the hook will be at
read_sha1_file() that gives you a rewritten object contents when you ask
for the original object, not at parse_commit_buffer which does not give
you a rewritten object, but makes you follow to a rewritten object when
parsing a commit (which itself is not replaced if it is the starting
point).

Doing replacement at parse_commit_buffer() time is one step too late.  For
example, if you have replacement information for the commit that sits at
the tip of 'master' branch, I think your "git log master" will ignore that
replacement information.  Creating one new commit on top of it and saying
"git log master" then will show the new commit, and suddenly starts
showing the replacement commit for the one you used to have at the tip of
the branch.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
  2009-01-07  7:43 [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/" Christian Couder
  2009-01-07  8:41 ` Junio C Hamano
@ 2009-01-07 12:29 ` Johannes Schindelin
  1 sibling, 0 replies; 7+ messages in thread
From: Johannes Schindelin @ 2009-01-07 12:29 UTC (permalink / raw)
  To: Christian Couder; +Cc: Junio C Hamano, git

Hi,

On Wed, 7 Jan 2009, Christian Couder wrote:

> diff --git a/replace_object.c b/replace_object.c
> new file mode 100644
> index 0000000..b50890d
> --- /dev/null
> +++ b/replace_object.c
> @@ -0,0 +1,102 @@
> +#include "cache.h"
> +#include "refs.h"
> +
> +static struct replace_object {
> +	unsigned char sha1[2][20];
> +} **replace_object;
> +
> +static int replace_object_alloc, replace_object_nr;
> +
> +static int replace_object_pos(const unsigned char *sha1)
> +{
> +	int lo, hi;
> +	lo = 0;
> +	hi = replace_object_nr;
> +	while (lo < hi) {
> [...]

I suspect that this sorted list should be a hashmap.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
  2009-01-07  8:41 ` Junio C Hamano
@ 2009-01-08 17:31   ` Christian Couder
  2009-01-08 23:55     ` Junio C Hamano
  0 siblings, 1 reply; 7+ messages in thread
From: Christian Couder @ 2009-01-08 17:31 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Le mercredi 7 janvier 2009, Junio C Hamano a écrit :
> Christian Couder <chriscool@tuxfamily.org> writes:
> > In "parse_commit_buffer", the parent sha1s from the original commit
> > or from a commit graft that match a ref name in "refs/replace/" are
> > replaced by the commit sha1 that has been read in the ref.
>
> I may be reading this wrong, but if you replace the parent of the commit,
> that won't improve anything over the graft mechanism, which we already
> have.
>
> What I was hoping to see was to give replacing commit back when a commit
> is asked for by normal callers, 

Yeah, but read_sha1_file is called to read all object files, not just 
commits. So putting the hook there will:

	1) add a lookup overhead when reading any object,
	2) make it possible to replace any object,

And there is also the following problem:

	3) this function is often called like this:

	buffer = read_sha1_file(sha1, &type, &size);
	if (!buffer)
		die("Cannot read %s", sha1_to_hex(sha1));

	so in case of error, it will give an error message with a bad sha1
	in it because the sha1 of the file that we cannot read is the sha1
	in the replace ref not the one passed to read_sha1_file.

To avoid the above problems, maybe we can try to also improve what 
read_sha1_file does:

1) allow callers to pass a type in the "type" argument and only lookup in 
the replace refs if we say we want a commit, but this makes calling this 
function more error prone
2) when we say we want an object with a given type, check if the object we 
read has this type (and die if not)
3) die in read_sha1_file when there is an error and we are replacing so that 
callers don't need to die themself and so that we can always report an 
accurate sha1 in the error message

Something like this (totally untested):

void *read_sha1_file(const unsigned char *sha1, enum object_type *type,
		     unsigned long *size)
{
	void *data;
	enum object_type read_type;
	int replacing = (type && *type == OBJ_COMMIT);

	/* only replace commits when we ask for one */
	if (replacing)
		sha1 = lookup_replace_object(sha1);
	data = read_object(sha1, &read_type, size);
	/* legacy behavior is to die on corrupted objects */
	if (!data && (has_loose_object(sha1) || has_packed_and_bad(sha1)))
		die("object %s is corrupted", sha1_to_hex(sha1));
	if (type && *type != OBJ_NONE && *type != read_type)
		die("type of object %s (%d) is different from the type we want (%d)",
		    sha1_to_hex(sha1), read_type, *type);
	if (!data && replacing)
		die("Cannot read commit %s", sha1_to_hex(sha1));

	return data;
}

Or perhaps it is better to leave read_sha1_file as it is and add a new 
function like this:

void *read_commit(const unsigned char *sha1, unsigned long *size)
{
	void *data;
	enum object_type type;

	sha1 = lookup_replace_object(sha1);
	data = read_sha1_file(sha1, &type, size);

	if (*type != OBJ_COMMIT)
		die("read_commit: object %s is not a commit", sha1_to_hex(sha1));
	if (!data)
		die("read_commit: cannot read commit %s", sha1_to_hex(sha1));

	return data;
}

The latter seems better to me. But I haven't looked much at the 60 
read_sha1_file callers yet.

[...]

> Doing replacement at parse_commit_buffer() time is one step too late. 
> For example, if you have replacement information for the commit that sits
> at the tip of 'master' branch, I think your "git log master" will ignore
> that replacement information.  Creating one new commit on top of it and
> saying "git log master" then will show the new commit, and suddenly
> starts showing the replacement commit for the one you used to have at the
> tip of the branch.

Yeah, I agree there are drawbacks to do it at parse_commit_buffer time.

Thanks,
Christian.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
  2009-01-08 17:31   ` Christian Couder
@ 2009-01-08 23:55     ` Junio C Hamano
  2009-01-10 16:30       ` Jakub Narebski
  0 siblings, 1 reply; 7+ messages in thread
From: Junio C Hamano @ 2009-01-08 23:55 UTC (permalink / raw)
  To: Christian Couder; +Cc: git

Christian Couder <chriscool@tuxfamily.org> writes:

> Yeah, but read_sha1_file is called to read all object files, not just 
> commits. So putting the hook there will:
>
> 	1) add a lookup overhead when reading any object,
> 	2) make it possible to replace any object,

I actually see (2) as an improvement, and (1) as an associated cost.

> And there is also the following problem:
>
> 	3) this function is often called like this:
>
> 	buffer = read_sha1_file(sha1, &type, &size);
> 	if (!buffer)
> 		die("Cannot read %s", sha1_to_hex(sha1));
>
> 	so in case of error, it will give an error message with a bad sha1
> 	in it because the sha1 of the file that we cannot read is the sha1
> 	in the replace ref not the one passed to read_sha1_file.

You have refs/replace/$A that records $B, to tell git that the real object
$A in the history is replaced by another object $B.  The caller feeds $A
in the above snippet to read_sha1_file(), and your read_sha1_file()
notices that it needs to read $B instead, returns the buffer from the
object $B, and reports its type and size.  If there is no $B available, it
may return NULL and the caller says "I asked for $A but in this repository
I cannot get to it".  That sounds consistent to me, but I agree it would
be more helpful to report "and the reason why I cannot get to it is
because you have replacement defined as $B which you do not have."

> To avoid the above problems, maybe we can try to also improve what 
> read_sha1_file does:
>
> 1) allow callers to pass a type in the "type" argument and only lookup in 
> the replace refs if we say we want a commit, but this makes calling this 
> function more error prone

This is debatable, but can go either way.

> 2) when we say we want an object with a given type, check if the object we 
> read has this type (and die if not)

That we already do anyway, don't we?  parse_commit() gets data from
read_sha1_file() and would complain if it gets a blob, etc.

> 3) die in read_sha1_file when there is an error and we are replacing so that 
> callers don't need to die themself and so that we can always report an 
> accurate sha1 in the error message

I expect the use of graft and object replacement (or if you insist,
"commit replacement") rather rare, and I think it is probably Ok to
declare it a grave repository misconfiguration if somebody claims that $A
is replaced by $B without actually having $B:

	void *read_sha1_file(sha1, type, size)
        {
		void *data;
        	unsigned char *replacement = lookup_replace_object(sha1);
                if (replacement) {
                	data = read_sha1_file(replacement, type, size);
                        if (!data)
                        	die("replacement %s not found for %s",
                                    get_sha1_hex(replacement),
                                    get_sha1_hex(sha1));
		} else {
			data = read_object(sha1, type, size);
		}
                ... existing code ...
                return data;                
        }


To disable replacement for connectivity walkers, lookup_replace_object()
can look at a some global defined in environment.c, perhaps.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
  2009-01-08 23:55     ` Junio C Hamano
@ 2009-01-10 16:30       ` Jakub Narebski
       [not found]         ` <1231727868.6716.155.camel@vaio>
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Narebski @ 2009-01-10 16:30 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Christian Couder, git, Tim Ansell

Junio C Hamano <gitster@pobox.com> writes:
> Christian Couder <chriscool@tuxfamily.org> writes:
> 
> > Yeah, but read_sha1_file is called to read all object files, not just 
> > commits. So putting the hook there will:
> >
> > 	1) add a lookup overhead when reading any object,
> > 	2) make it possible to replace any object,
> 
> I actually see (2) as an improvement, and (1) as an associated cost.

I just had an idea: we can use this mechanism to better manage large
binary files in Git, by using replacements for _blobs_.

We want to be able to have two flavours of repository: one with large
blobs (media files usually), and one without.  We can use stubs in the
place of large binary files in 'no-megablobs' flavor, and add contents
of those files via refs/replace/* for _blobs_ in 'with-megablobs'
flavour.  We can control which objects we want to have, and which
objects to transfer.

What do you think about this (abuse of) an idea?
-- 
Jakub Narebski
Poland
ShadeHawk on #git

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/"
       [not found]         ` <1231727868.6716.155.camel@vaio>
@ 2009-01-12  9:50           ` Jakub Narebski
  0 siblings, 0 replies; 7+ messages in thread
From: Jakub Narebski @ 2009-01-12  9:50 UTC (permalink / raw)
  To: Tim Ansell; +Cc: git

On Mon, 12 Jan 2009, Tim 'mithro' Ansell wrote:

> > I just had an idea: we can use this mechanism to better manage large
> > binary files in Git, by using replacements for _blobs_.
> > 
> > We want to be able to have two flavours of repository: one with large
> > blobs (media files usually), and one without.  We can use stubs in the
> > place of large binary files in 'no-megablobs' flavor, and add contents
> > of those files via refs/replace/* for _blobs_ in 'with-megablobs'
> > flavour.  We can control which objects we want to have, and which
> > objects to transfer.
> > 
> > What do you think about this (abuse of) an idea?
> 
> I'm not sure I understand. I don't really care much about the
> implementation details if it's transparent to the user :)
> 
> I have not really had much time to pursue my idea of getting sha1_file
> to read download blob on an as-needed basis (work has been hectic).

Actually this idea is a bit different from lazy / on demand downloading
of large blobs.


In "on demand loading" solution you have sha-1 of _full_ blob in a tree,
but git knows that not having it is not a fatal error (somehow), and you
can ask git to download it. This is as far as I understand the solution
you proposed and partially implemented.

In "replacement blobs" solution, the (ab)use of object replacement
mechanism meant originally for easier bisect, you have sha-1 of _stub_
object in a tree.  If you want full (large) blob, you add replacement
in refs/replace/*, replacing stub blob object (must be unique; it can
for example contain some header + sha-1 of full object) with full
object.  If you want to have large object, you need to transfer 
refs/replace/*.  This solution means that user needs to be aware of
this mechanism, or have some wrapper (script) around it.


Different solution, and a bit different behavior wrt getting large
object for the user.

-- 
Jakub Narebski
Poland

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-01-12  9:52 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-07  7:43 [RFC/PATCH 2/3] replace_object: add mechanism to replace objects found in "refs/replace/" Christian Couder
2009-01-07  8:41 ` Junio C Hamano
2009-01-08 17:31   ` Christian Couder
2009-01-08 23:55     ` Junio C Hamano
2009-01-10 16:30       ` Jakub Narebski
     [not found]         ` <1231727868.6716.155.camel@vaio>
2009-01-12  9:50           ` Jakub Narebski
2009-01-07 12:29 ` Johannes Schindelin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).