git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH 09/11] pack-check: do not unpack blobs
Date: Mon, 27 Feb 2012 14:55:13 +0700	[thread overview]
Message-ID: <1330329315-11407-10-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1330329315-11407-1-git-send-email-pclouds@gmail.com>

Blob content is not used by the verify_pack caller (currently only fsck);
we only need to make sure that each blob's SHA-1 matches its
content. unpack_entry() is taught to hash a pack entry as it is
unpacked, eliminating the need to keep the whole blob in memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 cache.h          |    2 +-
 fast-import.c    |    2 +-
 pack-check.c     |   21 ++++++++++++++++++++-
 sha1_file.c      |   45 +++++++++++++++++++++++++++++++++++----------
 t/t1050-large.sh |    2 +-
 5 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/cache.h b/cache.h
index 6ce691b..33bfb69 100644
--- a/cache.h
+++ b/cache.h
@@ -1065,7 +1065,7 @@ extern const unsigned char *nth_packed_object_sha1(struct packed_git *, uint32_t
 extern off_t nth_packed_object_offset(const struct packed_git *, uint32_t);
 extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *);
 extern int is_pack_valid(struct packed_git *);
-extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *);
+extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *, unsigned char *);
 extern unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
 extern unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t);
 extern int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *);
diff --git a/fast-import.c b/fast-import.c
index 6cd19e5..5e94a64 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -1303,7 +1303,7 @@ static void *gfi_unpack_entry(
 		 */
 		p->pack_size = pack_size + 20;
 	}
-	return unpack_entry(p, oe->idx.offset, &type, sizep);
+	return unpack_entry(p, oe->idx.offset, &type, sizep, NULL);
 }
 
 static const char *get_mode(const char *str, uint16_t *modep)
diff --git a/pack-check.c b/pack-check.c
index 63a595c..1920bdb 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -105,6 +105,7 @@ static int verify_packfile(struct packed_git *p,
 		void *data;
 		enum object_type type;
 		unsigned long size;
+		off_t curpos = entries[i].offset;
 
 		if (p->index_version > 1) {
 			off_t offset = entries[i].offset;
@@ -116,7 +117,25 @@ static int verify_packfile(struct packed_git *p,
 					    sha1_to_hex(entries[i].sha1),
 					    p->pack_name, (uintmax_t)offset);
 		}
-		data = unpack_entry(p, entries[i].offset, &type, &size);
+		type = unpack_object_header(p, w_curs, &curpos, &size);
+		unuse_pack(w_curs);
+		if (type == OBJ_BLOB) {
+			unsigned char sha1[20];
+			data = unpack_entry(p, entries[i].offset, &type, &size, sha1);
+			if (!data) {
+				if (hashcmp(entries[i].sha1, sha1))
+					err = error("packed %s from %s is corrupt",
+						    sha1_to_hex(entries[i].sha1), p->pack_name);
+				else if (fn) {
+					int eaten = 0;
+					fn(entries[i].sha1, type, size, NULL, &eaten);
+				}
+				if (((base_count + i) & 1023) == 0)
+					display_progress(progress, base_count + i);
+				continue;
+			}
+		}
+		data = unpack_entry(p, entries[i].offset, &type, &size, NULL);
 		if (!data)
 			err = error("cannot unpack %s from %s at offset %"PRIuMAX"",
 				    sha1_to_hex(entries[i].sha1), p->pack_name,
diff --git a/sha1_file.c b/sha1_file.c
index a77ef0a..d68a5b0 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1653,28 +1653,51 @@ static int packed_object_info(struct packed_git *p, off_t obj_offset,
 }
 
 static void *unpack_compressed_entry(struct packed_git *p,
-				    struct pack_window **w_curs,
-				    off_t curpos,
-				    unsigned long size)
+				     struct pack_window **w_curs,
+				     off_t curpos,
+				     unsigned long size,
+				     enum object_type type,
+				     unsigned char *sha1)
 {
+	static unsigned char fixed_buf[8192];
 	int st;
 	git_zstream stream;
 	unsigned char *buffer, *in;
+	git_SHA_CTX c;
+
+	if (sha1) {		/* do hash_sha1_file internally */
+		char hdr[32];
+		int hdrlen = sprintf(hdr, "%s %lu", typename(type), size)+1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+
+		buffer = fixed_buf;
+	} else {
+		buffer = xmallocz(size);
+	}
 
-	buffer = xmallocz(size);
 	memset(&stream, 0, sizeof(stream));
 	stream.next_out = buffer;
-	stream.avail_out = size + 1;
+	stream.avail_out = buffer == fixed_buf ? sizeof(fixed_buf) : size + 1;
 
 	git_inflate_init(&stream);
 	do {
 		in = use_pack(p, w_curs, curpos, &stream.avail_in);
 		stream.next_in = in;
 		st = git_inflate(&stream, Z_FINISH);
-		if (!stream.avail_out)
+		if (sha1) {
+			git_SHA1_Update(&c, buffer, stream.next_out - (unsigned char *)buffer);
+			stream.next_out = buffer;
+			stream.avail_out = sizeof(fixed_buf);
+		}
+		else if (!stream.avail_out)
 			break; /* the payload is larger than it should be */
 		curpos += stream.next_in - in;
 	} while (st == Z_OK || st == Z_BUF_ERROR);
+	if (sha1) {
+		git_SHA1_Final(sha1, &c);
+		buffer = NULL;
+	}
 	git_inflate_end(&stream);
 	if ((st != Z_STREAM_END) || stream.total_out != size) {
 		free(buffer);
@@ -1727,7 +1750,7 @@ static void *cache_or_unpack_entry(struct packed_git *p, off_t base_offset,
 
 	ret = ent->data;
 	if (!ret || ent->p != p || ent->base_offset != base_offset)
-		return unpack_entry(p, base_offset, type, base_size);
+		return unpack_entry(p, base_offset, type, base_size, NULL);
 
 	if (!keep_cache) {
 		ent->data = NULL;
@@ -1844,7 +1867,7 @@ static void *unpack_delta_entry(struct packed_git *p,
 			return NULL;
 	}
 
-	delta_data = unpack_compressed_entry(p, w_curs, curpos, delta_size);
+	delta_data = unpack_compressed_entry(p, w_curs, curpos, delta_size, OBJ_NONE, NULL);
 	if (!delta_data) {
 		error("failed to unpack compressed delta "
 		      "at offset %"PRIuMAX" from %s",
@@ -1883,7 +1906,8 @@ static void write_pack_access_log(struct packed_git *p, off_t obj_offset)
 int do_check_packed_object_crc;
 
 void *unpack_entry(struct packed_git *p, off_t obj_offset,
-		   enum object_type *type, unsigned long *sizep)
+		   enum object_type *type, unsigned long *sizep,
+		   unsigned char *sha1)
 {
 	struct pack_window *w_curs = NULL;
 	off_t curpos = obj_offset;
@@ -1917,7 +1941,8 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
 	case OBJ_TREE:
 	case OBJ_BLOB:
 	case OBJ_TAG:
-		data = unpack_compressed_entry(p, &w_curs, curpos, *sizep);
+		data = unpack_compressed_entry(p, &w_curs, curpos,
+					       *sizep, *type, sha1);
 		break;
 	default:
 		data = NULL;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 7e78c72..c749ecb 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -141,7 +141,7 @@ test_expect_success 'fetch updates' '
 	)
 '
 
-test_expect_failure 'fsck' '
+test_expect_success 'fsck' '
 	git fsck --full
 '
 
-- 
1.7.3.1.256.g2539c.dirty

  parent reply	other threads:[~2012-02-27  7:57 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-27  7:55 [PATCH 00/11] Large blob fixes Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` [PATCH 01/11] Add more large blob test cases Nguyễn Thái Ngọc Duy
2012-02-27 20:18   ` Peter Baumann
2012-02-27  7:55 ` [PATCH 02/11] Factor out and export large blob writing code to arbitrary file handle Nguyễn Thái Ngọc Duy
2012-02-27 17:29   ` Junio C Hamano
2012-02-27 21:50     ` Junio C Hamano
2012-02-27  7:55 ` [PATCH 03/11] cat-file: use streaming interface to print blobs Nguyễn Thái Ngọc Duy
2012-02-27 17:44   ` Junio C Hamano
2012-02-28  1:08     ` Nguyen Thai Ngoc Duy
2012-02-27  7:55 ` [PATCH 04/11] parse_object: special code path for blobs to avoid putting whole object in memory Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` [PATCH 05/11] show: use streaming interface for showing blobs Nguyễn Thái Ngọc Duy
2012-02-27 18:00   ` Junio C Hamano
2012-02-27  7:55 ` [PATCH 06/11] index-pack --verify: skip sha-1 collision test Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` [PATCH 07/11] index-pack: split second pass obj handling into own function Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` [PATCH 08/11] index-pack: reduce memory usage when the pack has large blobs Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` Nguyễn Thái Ngọc Duy [this message]
2012-02-27  7:55 ` [PATCH 10/11] archive: support streaming large files to a tar archive Nguyễn Thái Ngọc Duy
2012-02-27  7:55 ` [PATCH 11/11] fsck: use streaming interface for writing lost-found blobs Nguyễn Thái Ngọc Duy
2012-02-27 18:43 ` [PATCH 00/11] Large blob fixes Junio C Hamano
2012-02-28  1:23   ` Nguyen Thai Ngoc Duy
2012-03-04 12:59 ` [PATCH v2 00/10] " Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 01/10] Add more large blob test cases Nguyễn Thái Ngọc Duy
2012-03-06  0:59     ` Junio C Hamano
2012-03-04 12:59   ` [PATCH v2 02/10] streaming: make streaming-write-entry to be more reusable Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 03/10] cat-file: use streaming interface to print blobs Nguyễn Thái Ngọc Duy
2012-03-04 23:12     ` Junio C Hamano
2012-03-05  2:42       ` Nguyen Thai Ngoc Duy
2012-03-04 12:59   ` [PATCH v2 04/10] parse_object: special code path for blobs to avoid putting whole object in memory Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 05/10] show: use streaming interface for showing blobs Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 06/10] index-pack: split second pass obj handling into own function Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 07/10] index-pack: reduce memory usage when the pack has large blobs Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 08/10] pack-check: do not unpack blobs Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 09/10] archive: support streaming large files to a tar archive Nguyễn Thái Ngọc Duy
2012-03-04 12:59   ` [PATCH v2 10/10] fsck: use streaming interface for writing lost-found blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 00/11] Large blob fixes Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 01/11] Add more large blob test cases Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 02/11] streaming: make streaming-write-entry to be more reusable Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 03/11] cat-file: use streaming interface to print blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 04/11] parse_object: special code path for blobs to avoid putting whole object in memory Nguyễn Thái Ngọc Duy
2012-03-06  0:57     ` Junio C Hamano
2012-03-05  3:43   ` [PATCH v3 05/11] show: use streaming interface for showing blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 06/11] index-pack: split second pass obj handling into own function Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 07/11] index-pack: reduce memory usage when the pack has large blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 08/11] pack-check: do not unpack blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 09/11] archive: support streaming large files to a tar archive Nguyễn Thái Ngọc Duy
2012-03-06  0:57     ` Junio C Hamano
2012-03-05  3:43   ` [PATCH v3 10/11] fsck: use streaming interface for writing lost-found blobs Nguyễn Thái Ngọc Duy
2012-03-05  3:43   ` [PATCH v3 11/11] update-server-info: respect core.bigfilethreshold Nguyễn Thái Ngọc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1330329315-11407-10-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).