git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Junio C Hamano <gitster@pobox.com>
To: git@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Subject: [PATCH 3/3] abbrev: auto size the default abbreviation
Date: Fri, 30 Sep 2016 17:19:37 -0700	[thread overview]
Message-ID: <20161001001937.10884-4-gitster@pobox.com> (raw)
In-Reply-To: <20161001001937.10884-1-gitster@pobox.com>

From: Linus Torvalds <torvalds@linux-foundation.org>

In fairly early days we somehow decided to abbreviate object names
down to 7 hexdigits, but as projects grow, it is becoming more and
more likely that such short object names, made in earlier days and
recorded in the log messages, are no longer unique.

Currently the Linux kernel project needs 11 to 12 hexdigits, and
Git itself needs 10 hexdigits, to uniquely identify the objects they
have, while many smaller projects may still be fine with the
original 7-hexdigit default.  One size does not fit all projects.

Introduce a mechanism where we estimate the number of objects in
the repository upon the first request to abbreviate an object name
with the default setting and come up with a sane default for the
repository.  Based on the expectation that we would see a collision
in a repository with 2^(2N) objects when using object names
shortened to the first N hexdigits, use a sufficient number of
hexdigits to cover the number of objects in the repository.  Each
hexdigit (4 bits) we add to the shortened name allows us to have
four times (2 bits) as many objects in the repository.

---
 cache.h       |  1 +
 environment.c |  2 +-
 sha1_name.c   | 28 +++++++++++++++++++++++++++-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 5a651b8435..0e2a0595e5 100644
--- a/cache.h
+++ b/cache.h
@@ -1204,6 +1204,7 @@ struct object_context {
 #define GET_SHA1_TREEISH          020
 #define GET_SHA1_BLOB             040
 #define GET_SHA1_FOLLOW_SYMLINKS 0100
+#define GET_SHA1_AUTOMATIC	 0200
 #define GET_SHA1_ONLY_TO_DIE    04000
 
 #define GET_SHA1_DISAMBIGUATORS \
diff --git a/environment.c b/environment.c
index 44fb107b8a..6f9d290563 100644
--- a/environment.c
+++ b/environment.c
@@ -16,7 +16,7 @@ int trust_executable_bit = 1;
 int trust_ctime = 1;
 int check_stat = 1;
 int has_symlinks = 1;
-int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV;
+int minimum_abbrev = 4, default_abbrev = -1;
 int ignore_case;
 int assume_unchanged;
 int prefer_symlink_refs;
diff --git a/sha1_name.c b/sha1_name.c
index 3b647fd7cf..beb7ab588b 100644
--- a/sha1_name.c
+++ b/sha1_name.c
@@ -15,6 +15,7 @@ typedef int (*disambiguate_hint_fn)(const unsigned char *, void *);
 
 struct disambiguate_state {
 	int len; /* length of prefix in hex chars */
+	unsigned int nrobjects;
 	char hex_pfx[GIT_SHA1_HEXSZ + 1];
 	unsigned char bin_pfx[GIT_SHA1_RAWSZ];
 
@@ -118,6 +119,14 @@ static void find_short_object_filename(struct disambiguate_state *ds)
 
 			if (strlen(de->d_name) != 38)
 				continue;
+
+			/*
+			 * We only look at the one subdirectory, and we assume
+			 * each subdirectory is roughly similar, so each
+			 * object we find probably has 255 other objects in
+			 * the other fan-out directories.
+			 */
+			ds->nrobjects += 256;
 			if (memcmp(de->d_name, ds->hex_pfx + 2, ds->len - 2))
 				continue;
 			memcpy(hex + 2, de->d_name, 38);
@@ -151,6 +160,7 @@ static void unique_in_pack(struct packed_git *p,
 
 	open_pack_index(p);
 	num = p->num_objects;
+	ds->nrobjects += num;
 	last = num;
 	while (first < last) {
 		uint32_t mid = (first + last) / 2;
@@ -380,6 +390,9 @@ static int show_ambiguous_object(const unsigned char *sha1, void *data)
 	return 0;
 }
 
+/* start from our historical default before the automatic abbreviation */
+static int default_automatic_abbrev = FALLBACK_DEFAULT_ABBREV;
+
 static int get_short_sha1(const char *name, int len, unsigned char *sha1,
 			  unsigned flags)
 {
@@ -426,6 +439,14 @@ static int get_short_sha1(const char *name, int len, unsigned char *sha1,
 		for_each_abbrev(ds.hex_pfx, show_ambiguous_object, &ds);
 	}
 
+	if (len < 16 && !status && (flags & GET_SHA1_AUTOMATIC)) {
+		unsigned int expect_collision = 1 << (len * 2);
+		if (ds.nrobjects > expect_collision) {
+			default_automatic_abbrev = len+1;
+			return SHORT_NAME_AMBIGUOUS;
+		}
+	}
+
 	return status;
 }
 
@@ -458,14 +479,19 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data)
 int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len)
 {
 	int status, exists;
+	int flags = GET_SHA1_QUIETLY;
 
+	if (len < 0) {
+		flags |= GET_SHA1_AUTOMATIC;
+		len = default_automatic_abbrev;
+	}
 	sha1_to_hex_r(hex, sha1);
 	if (len == 40 || !len)
 		return 40;
 	exists = has_sha1_file(sha1);
 	while (len < 40) {
 		unsigned char sha1_ret[20];
-		status = get_short_sha1(hex, len, sha1_ret, GET_SHA1_QUIETLY);
+		status = get_short_sha1(hex, len, sha1_ret, flags);
 		if (exists
 		    ? !status
 		    : status == SHORT_NAME_NOT_FOUND) {
-- 
2.10.0-622-g05f606bbb0


  parent reply	other threads:[~2016-10-01  0:20 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-01  0:19 [PATCH 0/3] auto-sizing default abbreviation length Junio C Hamano
2016-10-01  0:19 ` [PATCH 1/3] abbrev: add FALLBACK_DEFAULT_ABBREV to prepare for auto sizing Junio C Hamano
2016-10-01  0:19 ` [PATCH 2/3] abbrev: prepare for new world order Junio C Hamano
2016-10-01  0:19 ` Junio C Hamano [this message]
2016-10-03 22:27   ` [PATCH 3/3] abbrev: auto size the default abbreviation Jeff King
2016-10-03 22:34     ` Linus Torvalds
2016-10-03 22:40       ` Jeff King
2016-10-03 22:52         ` Junio C Hamano
2016-10-03 23:47           ` Jeff King
2016-10-04  1:37             ` Junio C Hamano
2016-10-04 12:18               ` Jeff King
2016-11-02  1:33     ` Junio C Hamano
2016-11-02  2:12       ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161001001937.10884-4-gitster@pobox.com \
    --to=gitster@pobox.com \
    --cc=git@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).