git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Junio C Hamano <junkio@cox.net>
To: Linus Torvalds <torvalds@osdl.org>
Cc: git@vger.kernel.org
Subject: Re: [PATCH] pack-objects: re-validate data we copy from elsewhere.
Date: Sun, 03 Sep 2006 21:06:59 -0700	[thread overview]
Message-ID: <7vhczos2ak.fsf@assigned-by-dhcp.cox.net> (raw)
In-Reply-To: <7vodtwtw9v.fsf@assigned-by-dhcp.cox.net> (Junio C. Hamano's message of "Sun, 03 Sep 2006 15:34:04 -0700")

Junio C Hamano <junkio@cox.net> writes:

> Linus Torvalds <torvalds@osdl.org> writes:
>
>> Ok. Is it less painful if it just checks the zlib CRC...
>
> I haven't checked myself but somebody said that zlib CRC is of
> preimage so we would need to incur inflate cost anyway if that
> is the case.  But I think it may be a reasonable comproise to
> assume that an existing delta that inflates properly would apply
> to its base object, and if we can assume that we do not have to
> check the inflated xdelta data.
> ...
> Another thing the current check is _not_ doing is for this
> pathological case:
>
>  - .idx table says the pack entry is N bytes
>
>  - unpack_entry_gently() used in the revalidate code uses the
>    usual codepath that says "here is the start of the pack
>    entry; inflate using as much data as you need"; .idx is
>    somehow wrong and it needed N+M bytes where 0 < M.
>
>  - we copy out N bytes because we belive .idx.

So here is a patch against "master" which contains none of the
earlier patches in this thread.

When copying from an existing pack and when copying from a loose
object with new style header, the code makes sure that the piece
we are going to copy out inflates well and inflate() consumes
the data in full while doing so.

The check to see if the xdelta really apply is quite expensive
as you described, because you would need to have the image of
the base object which can be represented as a delta against
something else.

The same repack takes this much with this code.

Total 301361, written 301361 (delta 238935), reused 300995 (delta 238663)
6.66user 0.67system 0:07.40elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+62728minor)pagefaults 0swaps

---
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 46f524d..149fa28 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -65,6 +65,7 @@ static unsigned char pack_file_sha1[20];
 static int progress = 1;
 static volatile sig_atomic_t progress_update;
 static int window = 10;
+static int pack_to_stdout;
 
 /*
  * The object names in objects array are hashed with this hashtable,
@@ -242,6 +243,82 @@ static int encode_header(enum object_typ
 	return n;
 }
 
+static int check_inflate(unsigned char *data, unsigned long len, unsigned long expect)
+{
+	z_stream stream;
+	unsigned char fakebuf[4096];
+	int st;
+
+	memset(&stream, 0, sizeof(stream));
+	stream.next_in = data;
+	stream.avail_in = len;
+	stream.next_out = fakebuf;
+	stream.avail_out = sizeof(fakebuf);
+	inflateInit(&stream);
+
+	while (1) {
+		st = inflate(&stream, Z_FINISH);
+		if (st == Z_STREAM_END || st == Z_OK) {
+			st = (stream.total_out == expect &&
+			      stream.total_in == len) ? 0 : -1;
+			break;
+		}
+		if (st != Z_BUF_ERROR) {
+			st = -1;
+			break;
+		}
+		stream.next_out = fakebuf;
+		stream.avail_out = sizeof(fakebuf);
+	}
+	inflateEnd(&stream);
+	return st;
+}
+
+/*
+ * we are going to reuse the existing pack entry data.  make
+ * sure it is not corrupt.
+ */
+static int revalidate_pack_entry(struct object_entry *entry, unsigned char *data, unsigned long len)
+{
+	enum object_type type;
+	unsigned long size, used;
+
+	if (pack_to_stdout)
+		return 0;
+
+	/* the caller has already called use_packed_git() for us,
+	 * so it is safe to access the pack data from mmapped location.
+	 * make sure the entry inflates correctly.
+	 */
+	used = unpack_object_header_gently(data, len, &type, &size);
+	if (!used)
+		return -1;
+	if (type == OBJ_DELTA)
+		used += 20; /* skip base object name */
+	data += used;
+	len -= used;
+	return check_inflate(data, len, entry->size);
+}
+
+static int revalidate_loose_object(struct object_entry *entry,
+				   unsigned char *map,
+				   unsigned long mapsize)
+{
+	/* we already know this is a loose object with new type header. */
+	enum object_type type;
+	unsigned long size, used;
+
+	if (pack_to_stdout)
+		return 0;
+
+	used = unpack_object_header_gently(map, mapsize, &type, &size);
+	if (!used)
+		return -1;
+	map += used;
+	mapsize -= used;
+	return check_inflate(map, mapsize, size);
+}
+
 static unsigned long write_object(struct sha1file *f,
 				  struct object_entry *entry)
 {
@@ -276,6 +353,9 @@ static unsigned long write_object(struct
 		map = map_sha1_file(entry->sha1, &mapsize);
 		if (map && !legacy_loose_object(map)) {
 			/* We can copy straight into the pack file */
+			if (revalidate_loose_object(entry, map, mapsize))
+				die("corrupt loose object %s",
+				    sha1_to_hex(entry->sha1));
 			sha1write(f, map, mapsize);
 			munmap(map, mapsize);
 			written++;
@@ -286,7 +366,7 @@ static unsigned long write_object(struct
 			munmap(map, mapsize);
 	}
 
-	if (! to_reuse) {
+	if (!to_reuse) {
 		buf = read_sha1_file(entry->sha1, type, &size);
 		if (!buf)
 			die("unable to read %s", sha1_to_hex(entry->sha1));
@@ -319,6 +399,9 @@ static unsigned long write_object(struct
 
 		datalen = find_packed_object_size(p, entry->in_pack_offset);
 		buf = (char *) p->pack_base + entry->in_pack_offset;
+
+		if (revalidate_pack_entry(entry, buf, datalen))
+			die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
 		sha1write(f, buf, datalen);
 		unuse_packed_git(p);
 		hdrlen = 0; /* not really */
@@ -1163,7 +1246,7 @@ static void prepare_pack(int window, int
 		find_deltas(sorted_by_type, window+1, depth);
 }
 
-static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout)
+static int reuse_cached_pack(unsigned char *sha1)
 {
 	static const char cache[] = "pack-cache/pack-%s.%s";
 	char *cached_pack, *cached_idx;
@@ -1247,7 +1330,7 @@ int cmd_pack_objects(int argc, const cha
 {
 	SHA_CTX ctx;
 	char line[40 + 1 + PATH_MAX + 2];
-	int depth = 10, pack_to_stdout = 0;
+	int depth = 10;
 	struct object_entry **list;
 	int num_preferred_base = 0;
 	int i;
@@ -1367,7 +1450,7 @@ int cmd_pack_objects(int argc, const cha
 	if (progress && (nr_objects != nr_result))
 		fprintf(stderr, "Result has %d objects.\n", nr_result);
 
-	if (reuse_cached_pack(object_list_sha1, pack_to_stdout))
+	if (reuse_cached_pack(object_list_sha1))
 		;
 	else {
 		if (nr_result)


-- 
VGER BF report: U 0.976258

  reply	other threads:[~2006-09-04  4:07 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <9e4733910608290943g6aa79855q62b98caf4f19510@mail.gmail.com>
     [not found] ` <20060829165811.GB21729@spearce.org>
     [not found]   ` <9e4733910608291037k2d9fb791v18abc19bdddf5e89@mail.gmail.com>
     [not found]     ` <20060829175819.GE21729@spearce.org>
     [not found]       ` <9e4733910608291155g782953bbv5df1b74878f4fcf1@mail.gmail.com>
     [not found]         ` <20060829190548.GK21729@spearce.org>
     [not found]           ` <9e4733910608291252q130fc723r945e6ab906ca6969@mail.gmail.com>
     [not found]             ` <20060829232007.GC22935@spearce.org>
     [not found]               ` <9e4733910608291807q9b896e4sdbfaa9e49de58c2b@mail.gmail.com>
2006-08-30  1:51                 ` Mozilla .git tree Shawn Pearce
2006-08-30  2:25                   ` Shawn Pearce
2006-08-30  2:58                   ` Jon Smirl
2006-08-30  3:10                     ` Shawn Pearce
2006-08-30  3:27                       ` Jon Smirl
2006-08-30  5:53                       ` Nicolas Pitre
2006-08-30 11:42                         ` Junio C Hamano
2006-09-01  7:42                           ` Junio C Hamano
2006-09-02  1:19                             ` Shawn Pearce
2006-09-02  4:01                               ` Junio C Hamano
2006-09-02  4:39                                 ` Shawn Pearce
2006-09-02 11:06                                   ` Junio C Hamano
2006-09-02 14:20                                     ` Jon Smirl
2006-09-02 17:39                                       ` Shawn Pearce
2006-09-02 18:56                                         ` Linus Torvalds
2006-09-02 20:53                                           ` Junio C Hamano
2006-09-02 17:44                                     ` Shawn Pearce
2006-09-02  2:04                             ` Shawn Pearce
2006-09-02 11:02                               ` Junio C Hamano
2006-09-02 17:51                                 ` Shawn Pearce
2006-09-02 20:55                                   ` Junio C Hamano
2006-09-03  3:54                                     ` Shawn Pearce
2006-09-01 17:45                           ` A Large Angry SCM
2006-09-01 18:35                             ` Linus Torvalds
2006-09-01 19:56                               ` Junio C Hamano
2006-09-01 23:14                               ` [PATCH] pack-objects: re-validate data we copy from elsewhere Junio C Hamano
2006-09-02  0:23                                 ` Linus Torvalds
2006-09-02  1:39                                   ` VGER BF report? Johannes Schindelin
2006-09-02  5:58                                     ` Sam Ravnborg
2006-09-02  1:52                                   ` [PATCH] pack-objects: re-validate data we copy from elsewhere Junio C Hamano
2006-09-02  3:52                                   ` Junio C Hamano
2006-09-02  4:52                                     ` Shawn Pearce
2006-09-02  9:42                                       ` Junio C Hamano
2006-09-02 17:43                                         ` Linus Torvalds
2006-09-02 10:09                                       ` Junio C Hamano
2006-09-02 17:54                                         ` Shawn Pearce
2006-09-03 21:00                                           ` Junio C Hamano
2006-09-04  4:10                                             ` Shawn Pearce
2006-09-04  5:50                                               ` Junio C Hamano
2006-09-04  6:44                                                 ` Shawn Pearce
2006-09-04  7:39                                                   ` Junio C Hamano
2006-09-03  0:27                                         ` Linus Torvalds
2006-09-03  0:32                                           ` Junio C Hamano
2006-09-05  8:12                                           ` Junio C Hamano
2006-09-02 18:43                                     ` Linus Torvalds
2006-09-02 20:56                                       ` Junio C Hamano
2006-09-03 21:48                                       ` Junio C Hamano
2006-09-03 22:00                                         ` Linus Torvalds
2006-09-03 22:16                                           ` Linus Torvalds
2006-09-03 22:34                                           ` Junio C Hamano
2006-09-04  4:06                                             ` Junio C Hamano [this message]
2006-09-04 15:19                                               ` Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7vhczos2ak.fsf@assigned-by-dhcp.cox.net \
    --to=junkio@cox.net \
    --cc=git@vger.kernel.org \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).