[PATCH] Implement --fuzz= option for git-apply.

Git development
 help / color / mirror / Atom feed

* [PATCH] Implement --fuzz= option for git-apply.
@ 2006-04-10  2:41 Eric W. Biederman
  2006-04-10  5:52 ` Eric W. Biederman
  0 siblings, 1 reply; 8+ messages in thread
From: Eric W. Biederman @ 2006-04-10  2:41 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


Currently to import the -mm tree I have to work around
git-apply by using patch.  Because some of Andrews
patches in quilt will only apply with fuzz.

Allow git-apply to handle fuzz makes it much easier to import
the -mm tree into git.  I am still only processing about 1.5 patch a
second which for the 692 patches in 2.6.17-rc1-mm2 is still painful
but it does help.

If I just apply the patches and don't run git-mailinfo
git-write-tree, and git-write-commit I get about 4 patches
per second.

This patch defaults to leaving fuzz processing off so if you don't
want patches that only apply with fuzz you won't get them.

If a patch does require fuzz to apply you will get a warning:
> Fragment applied at offset: +-#lines (fuzz: #context_lines_deleted)

diff --git a/apply.c b/apply.c
index 33b4271..a07503f 100644
--- a/apply.c
+++ b/apply.c
@@ -32,8 +32,9 @@ static int apply = 1;
 static int no_add = 0;
 static int show_index_info = 0;
 static int line_termination = '\n';
+static int p_fuzz = 0;
 static const char apply_usage[] =
-"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
+"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [--fuzz=NUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
 
 static enum whitespace_eol {
 	nowarn_whitespace,
@@ -100,6 +101,7 @@ static int max_change, max_len;
 static int linenr = 1;
 
 struct fragment {
+	unsigned long context;
 	unsigned long oldpos, oldlines;
 	unsigned long newpos, newlines;
 	const char *patch;
@@ -817,12 +819,15 @@ static int parse_fragment(char *line, un
 	int added, deleted;
 	int len = linelen(line, size), offset;
 	unsigned long oldlines, newlines;
+	unsigned long leading, trailing;
 
 	offset = parse_fragment_header(line, len, fragment);
 	if (offset < 0)
 		return -1;
 	oldlines = fragment->oldlines;
 	newlines = fragment->newlines;
+	leading = 0;
+	trailing = 0;
 
 	if (patch->is_new < 0) {
 		patch->is_new =  !oldlines;
@@ -860,10 +865,14 @@ static int parse_fragment(char *line, un
 		case ' ':
 			oldlines--;
 			newlines--;
+			if (!deleted && !added)
+				leading++;
+			trailing++;
 			break;
 		case '-':
 			deleted++;
 			oldlines--;
+			trailing = 0;
 			break;
 		case '+':
 			/*
@@ -887,6 +896,7 @@ static int parse_fragment(char *line, un
 			}
 			added++;
 			newlines--;
+			trailing = 0;
 			break;
 
                 /* We allow "\ No newline at end of file". Depending
@@ -904,6 +914,10 @@ static int parse_fragment(char *line, un
 	}
 	if (oldlines || newlines)
 		return -1;
+	fragment->context = leading;
+	if (leading > trailing)
+		fragment->context = trailing;
+
 	/* If a fragment ends with an incomplete line, we failed to include
 	 * it in the above loop because we hit oldlines == newlines == 0
 	 * before seeing it.
@@ -1087,7 +1101,7 @@ static int read_old_data(struct stat *st
 	}
 }
 
-static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line)
+static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line, int *lines)
 {
 	int i;
 	unsigned long start, backwards, forwards;
@@ -1148,6 +1162,7 @@ static int find_offset(const char *buf, 
 		n = (i >> 1)+1;
 		if (i & 1)
 			n = -n;
+		*lines = n;
 		return try;
 	}
 
@@ -1155,6 +1170,31 @@ static int find_offset(const char *buf, 
 	 * We should start searching forward and backward.
 	 */
 	return -1;
+}
+
+static void reduce_context(char **buf, int *size)
+{
+	char *ctx = *buf;
+	unsigned long ctxsize = *size;
+	unsigned long offset;
+
+	/* Remove the first line */
+	offset = 0;
+	while (offset <= ctxsize) {
+		if (ctx[offset++] == '\n')
+			break;
+	}
+	ctxsize -= offset;
+	ctx += offset;
+	/* Remove the last line */
+	offset = ctxsize - 1;
+	while (offset > 0) {
+		if (ctx[--offset] == '\n')
+			break;
+	}
+	ctxsize = offset + 1;
+	*buf = ctx;
+	*size = ctxsize;
 }
 
 struct buffer_desc {
@@ -1192,7 +1232,10 @@ static int apply_one_fragment(struct buf
 	int offset, size = frag->size;
 	char *old = xmalloc(size);
 	char *new = xmalloc(size);
-	int oldsize = 0, newsize = 0;
+	char *ctx;
+	int oldsize = 0, newsize = 0, ctxsize;
+	int lines;
+	int fuzz, max_fuzz;
 
 	while (size > 0) {
 		int len = linelen(patch, size);
@@ -1241,23 +1284,39 @@ #ifdef NO_ACCURATE_DIFF
 		newsize--;
 	}
 #endif
+
+	offset = -1; /* shutup gcc */
+	ctx = old;
+	ctxsize = oldsize;
+	lines = 0;
+	max_fuzz = (p_fuzz < frag->context) ? p_fuzz : frag->context;
+	for (fuzz = 0; fuzz <= max_fuzz; fuzz++) {
+		/* Reduce the number of context lines */
+		if (fuzz) 
+			reduce_context(&ctx, &ctxsize);
+		offset = find_offset(buf, desc->size, ctx, ctxsize, frag->newpos + fuzz, &lines);
+		if (offset >= 0) {
+			int diff = newsize - ctxsize;
+			unsigned long size = desc->size + diff;
+			unsigned long alloc = desc->alloc;
+
+			if (fuzz)
+				fprintf(stderr, "Fragment applied at offset: %d (fuzz: %d)\n",
+					lines, fuzz);
+
+			if (size > alloc) {
+				alloc = size + 8192;
+				desc->alloc = alloc;
+				buf = xrealloc(buf, alloc);
+				desc->buffer = buf;
+			}
+			desc->size = size;
+			memmove(buf + offset + newsize, buf + offset + ctxsize, size - offset - newsize);
+			memcpy(buf + offset, new, newsize);
+			offset = 0;
 			
-	offset = find_offset(buf, desc->size, old, oldsize, frag->newpos);
-	if (offset >= 0) {
-		int diff = newsize - oldsize;
-		unsigned long size = desc->size + diff;
-		unsigned long alloc = desc->alloc;
-
-		if (size > alloc) {
-			alloc = size + 8192;
-			desc->alloc = alloc;
-			buf = xrealloc(buf, alloc);
-			desc->buffer = buf;
+			break;
 		}
-		desc->size = size;
-		memmove(buf + offset + newsize, buf + offset + oldsize, size - offset - newsize);
-		memcpy(buf + offset, new, newsize);
-		offset = 0;
 	}
 
 	free(old);
@@ -1943,6 +2002,10 @@ int main(int argc, char **argv)
 		}
 		if (!strcmp(arg, "-z")) {
 			line_termination = 0;
+			continue;
+		}
+		if (!strncmp(arg, "--fuzz=", 7)) {
+			p_fuzz = atoi(arg + 7);
 			continue;
 		}
 		if (!strncmp(arg, "--whitespace=", 13)) {

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement --fuzz= option for git-apply.
  2006-04-10  2:41 [PATCH] Implement --fuzz= option for git-apply Eric W. Biederman
@ 2006-04-10  5:52 ` Eric W. Biederman
  2006-04-10  9:33   ` [PATCH] Implement limited context matching in git-apply Eric W. Biederman
  0 siblings, 1 reply; 8+ messages in thread
From: Eric W. Biederman @ 2006-04-10  5:52 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

ebiederm@xmission.com (Eric W. Biederman) writes:

> Currently to import the -mm tree I have to work around
> git-apply by using patch.  Because some of Andrews
> patches in quilt will only apply with fuzz.
>
> Allow git-apply to handle fuzz makes it much easier to import
> the -mm tree into git.  I am still only processing about 1.5 patch a
> second which for the 692 patches in 2.6.17-rc1-mm2 is still painful
> but it does help.
>
> If I just apply the patches and don't run git-mailinfo
> git-write-tree, and git-write-commit I get about 4 patches
> per second.
>
> This patch defaults to leaving fuzz processing off so if you don't
> want patches that only apply with fuzz you won't get them.
>
> If a patch does require fuzz to apply you will get a warning:
>> Fragment applied at offset: +-#lines (fuzz: #context_lines_deleted)

Bother I almost had it right the first time.
I forgot to remove the context lines from the new lines that we
apply, in addition to the old lines that we remove.  This updated
patch fixes that problem.

Context lines patching themselves in is a weird bug.

Eric


diff --git a/apply.c b/apply.c
index 33b4271..4faf365 100644
--- a/apply.c
+++ b/apply.c
@@ -32,8 +32,9 @@ static int apply = 1;
 static int no_add = 0;
 static int show_index_info = 0;
 static int line_termination = '\n';
+static int p_fuzz = 0;
 static const char apply_usage[] =
-"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
+"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [--fuzz=NUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
 
 static enum whitespace_eol {
 	nowarn_whitespace,
@@ -100,6 +101,7 @@ static int max_change, max_len;
 static int linenr = 1;
 
 struct fragment {
+	unsigned long context;
 	unsigned long oldpos, oldlines;
 	unsigned long newpos, newlines;
 	const char *patch;
@@ -817,12 +819,15 @@ static int parse_fragment(char *line, un
 	int added, deleted;
 	int len = linelen(line, size), offset;
 	unsigned long oldlines, newlines;
+	unsigned long leading, trailing;
 
 	offset = parse_fragment_header(line, len, fragment);
 	if (offset < 0)
 		return -1;
 	oldlines = fragment->oldlines;
 	newlines = fragment->newlines;
+	leading = 0;
+	trailing = 0;
 
 	if (patch->is_new < 0) {
 		patch->is_new =  !oldlines;
@@ -860,10 +865,14 @@ static int parse_fragment(char *line, un
 		case ' ':
 			oldlines--;
 			newlines--;
+			if (!deleted && !added)
+				leading++;
+			trailing++;
 			break;
 		case '-':
 			deleted++;
 			oldlines--;
+			trailing = 0;
 			break;
 		case '+':
 			/*
@@ -887,6 +896,7 @@ static int parse_fragment(char *line, un
 			}
 			added++;
 			newlines--;
+			trailing = 0;
 			break;
 
                 /* We allow "\ No newline at end of file". Depending
@@ -904,6 +914,10 @@ static int parse_fragment(char *line, un
 	}
 	if (oldlines || newlines)
 		return -1;
+	fragment->context = leading;
+	if (leading > trailing)
+		fragment->context = trailing;
+
 	/* If a fragment ends with an incomplete line, we failed to include
 	 * it in the above loop because we hit oldlines == newlines == 0
 	 * before seeing it.
@@ -1087,7 +1101,7 @@ static int read_old_data(struct stat *st
 	}
 }
 
-static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line)
+static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line, int *lines)
 {
 	int i;
 	unsigned long start, backwards, forwards;
@@ -1148,6 +1162,7 @@ static int find_offset(const char *buf, 
 		n = (i >> 1)+1;
 		if (i & 1)
 			n = -n;
+		*lines = n;
 		return try;
 	}
 
@@ -1155,6 +1170,31 @@ static int find_offset(const char *buf, 
 	 * We should start searching forward and backward.
 	 */
 	return -1;
+}
+
+static void reduce_context(char **buf, int *size)
+{
+	char *ctx = *buf;
+	unsigned long ctxsize = *size;
+	unsigned long offset;
+
+	/* Remove the first line */
+	offset = 0;
+	while (offset <= ctxsize) {
+		if (ctx[offset++] == '\n')
+			break;
+	}
+	ctxsize -= offset;
+	ctx += offset;
+	/* Remove the last line */
+	offset = ctxsize - 1;
+	while (offset > 0) {
+		if (ctx[--offset] == '\n')
+			break;
+	}
+	ctxsize = offset + 1;
+	*buf = ctx;
+	*size = ctxsize;
 }
 
 struct buffer_desc {
@@ -1192,7 +1232,10 @@ static int apply_one_fragment(struct buf
 	int offset, size = frag->size;
 	char *old = xmalloc(size);
 	char *new = xmalloc(size);
+	char *oldlines, *newlines;
 	int oldsize = 0, newsize = 0;
+	int lines;
+	int fuzz, max_fuzz;
 
 	while (size > 0) {
 		int len = linelen(patch, size);
@@ -1241,23 +1284,42 @@ #ifdef NO_ACCURATE_DIFF
 		newsize--;
 	}
 #endif
+
+	offset = -1; /* shutup gcc */
+	oldlines = old;
+	newlines = new;
+	lines = 0;
+	max_fuzz = (p_fuzz < frag->context) ? p_fuzz : frag->context;
+	for (fuzz = 0; fuzz <= max_fuzz; fuzz++) {
+		/* Reduce the number of context lines */
+		if (fuzz) {
+			reduce_context(&oldlines, &oldsize);
+			reduce_context(&newlines, &newsize);
+		}
 			
-	offset = find_offset(buf, desc->size, old, oldsize, frag->newpos);
-	if (offset >= 0) {
-		int diff = newsize - oldsize;
-		unsigned long size = desc->size + diff;
-		unsigned long alloc = desc->alloc;
-
-		if (size > alloc) {
-			alloc = size + 8192;
-			desc->alloc = alloc;
-			buf = xrealloc(buf, alloc);
-			desc->buffer = buf;
+		offset = find_offset(buf, desc->size, oldlines, oldsize, frag->newpos + fuzz, &lines);
+		if (offset >= 0) {
+			int diff = newsize - oldsize;
+			unsigned long size = desc->size + diff;
+			unsigned long alloc = desc->alloc;
+
+			if (fuzz)
+				fprintf(stderr, "Fragment applied at offset: %d (fuzz: %d)\n",
+					lines, fuzz);
+
+			if (size > alloc) {
+				alloc = size + 8192;
+				desc->alloc = alloc;
+				buf = xrealloc(buf, alloc);
+				desc->buffer = buf;
+			}
+			desc->size = size;
+			memmove(buf + offset + newsize, buf + offset + oldsize, size - offset - newsize);
+			memcpy(buf + offset, newlines, newsize);
+			offset = 0;
+			
+			break;
 		}
-		desc->size = size;
-		memmove(buf + offset + newsize, buf + offset + oldsize, size - offset - newsize);
-		memcpy(buf + offset, new, newsize);
-		offset = 0;
 	}
 
 	free(old);
@@ -1943,6 +2005,10 @@ int main(int argc, char **argv)
 		}
 		if (!strcmp(arg, "-z")) {
 			line_termination = 0;
+			continue;
+		}
+		if (!strncmp(arg, "--fuzz=", 7)) {
+			p_fuzz = atoi(arg + 7);
 			continue;
 		}
 		if (!strncmp(arg, "--whitespace=", 13)) {

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] Implement limited context matching in git-apply.
  2006-04-10  5:52 ` Eric W. Biederman
@ 2006-04-10  9:33   ` Eric W. Biederman
  2006-04-10 15:25     ` Linus Torvalds
  2006-04-10 19:29     ` Junio C Hamano
  0 siblings, 2 replies; 8+ messages in thread
From: Eric W. Biederman @ 2006-04-10  9:33 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


Ok this really should be the good version.  The option
handling has been reworked to be automation safe.

Currently to import the -mm tree I have to work around
git-apply by using patch.  Because some of Andrews
patches in quilt will only apply with fuzz.

I started out implementing a --fuzz option and then I realized
fuzz is not a very safe concept for an automated system.  What
you really want is a minimum number of context lines that must
match.  This allows policy to be set without knowing how many
lines of context a patch actually provides.   By default
the policy remains to match all provided lines of context.

Allowng git-apply to match a restricted set of context makes
it much easier to import the -mm tree into git.  I am still only
processing  1.5 to 1.6 patches a second for the 692 patches in
2.6.17-rc1-mm2 is still painful but it does help.

If I just loop through all of Andrews patches in order
and run git-apply --index -C1 I process the entire patchset
in 1m53s or about 6 patches per second.  So running
git-mailinfo, git-write-tree, git-commit-tree, and
git-update-ref everytime has a measurable impact,
and shows things can be speeded up even more.

All of these timings were taking on my poor 700Mhz Athlon
with 512MB of ram.  So people with fast machiens should
see much better performance.

When a match is found after the number of context are reduced a
warning is generated.  Since this is a rare event and possibly
dangerous this seems to make sense.  Unless you are patching
a single file the error message is a little bit terse at
the moment, but it should be easy to go back and fix.

I have also updated the documentation for git-apply to reflect
the new -C option that sets the minimum number of context
lines that must match.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>


---

 Documentation/git-apply.txt |    8 ++-
 apply.c                     |  121 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 111 insertions(+), 18 deletions(-)

6b3a4565b760664a9b72096dd5eea8be9e1d1311
diff --git a/Documentation/git-apply.txt b/Documentation/git-apply.txt
index 1c64a1a..e93ea1f 100644
--- a/Documentation/git-apply.txt
+++ b/Documentation/git-apply.txt
@@ -11,7 +11,7 @@ SYNOPSIS
 [verse]
 'git-apply' [--stat] [--numstat] [--summary] [--check] [--index] [--apply]
 	  [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM]
-	  [--whitespace=<nowarn|warn|error|error-all|strip>]
+	  [-CNUM] [--whitespace=<nowarn|warn|error|error-all|strip>]
 	  [<patch>...]
 
 DESCRIPTION
@@ -72,6 +72,12 @@ OPTIONS
 -p<n>::
 	Remove <n> leading slashes from traditional diff paths. The
 	default is 1.
+
+-C<n>::
+	Ensure at least <n> lines of surrounding context match before
+	and after each change.  When fewer lines of surrounding
+	context exist they all most match.  By default no context is
+	ever ignored.
 
 --apply::
 	If you use any of the options marked ``Turns off
diff --git a/apply.c b/apply.c
index 33b4271..147a919 100644
--- a/apply.c
+++ b/apply.c
@@ -32,8 +32,9 @@ static int apply = 1;
 static int no_add = 0;
 static int show_index_info = 0;
 static int line_termination = '\n';
+static unsigned long p_context = -1;
 static const char apply_usage[] =
-"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
+"git-apply [--stat] [--numstat] [--summary] [--check] [--index] [--apply] [--no-add] [--index-info] [--allow-binary-replacement] [-z] [-pNUM] [-CNUM] [--whitespace=<nowarn|warn|error|error-all|strip>] <patch>...";
 
 static enum whitespace_eol {
 	nowarn_whitespace,
@@ -100,6 +101,7 @@ static int max_change, max_len;
 static int linenr = 1;
 
 struct fragment {
+	unsigned long leading, trailing;
 	unsigned long oldpos, oldlines;
 	unsigned long newpos, newlines;
 	const char *patch;
@@ -817,12 +819,15 @@ static int parse_fragment(char *line, un
 	int added, deleted;
 	int len = linelen(line, size), offset;
 	unsigned long oldlines, newlines;
+	unsigned long leading, trailing;
 
 	offset = parse_fragment_header(line, len, fragment);
 	if (offset < 0)
 		return -1;
 	oldlines = fragment->oldlines;
 	newlines = fragment->newlines;
+	leading = 0;
+	trailing = 0;
 
 	if (patch->is_new < 0) {
 		patch->is_new =  !oldlines;
@@ -860,10 +865,14 @@ static int parse_fragment(char *line, un
 		case ' ':
 			oldlines--;
 			newlines--;
+			if (!deleted && !added)
+				leading++;
+			trailing++;
 			break;
 		case '-':
 			deleted++;
 			oldlines--;
+			trailing = 0;
 			break;
 		case '+':
 			/*
@@ -887,6 +896,7 @@ static int parse_fragment(char *line, un
 			}
 			added++;
 			newlines--;
+			trailing = 0;
 			break;
 
                 /* We allow "\ No newline at end of file". Depending
@@ -904,6 +914,9 @@ static int parse_fragment(char *line, un
 	}
 	if (oldlines || newlines)
 		return -1;
+	fragment->leading = leading;
+	fragment->trailing = trailing;
+
 	/* If a fragment ends with an incomplete line, we failed to include
 	 * it in the above loop because we hit oldlines == newlines == 0
 	 * before seeing it.
@@ -1087,7 +1100,7 @@ static int read_old_data(struct stat *st
 	}
 }
 
-static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line)
+static int find_offset(const char *buf, unsigned long size, const char *fragment, unsigned long fragsize, int line, int *lines)
 {
 	int i;
 	unsigned long start, backwards, forwards;
@@ -1148,6 +1161,7 @@ static int find_offset(const char *buf, 
 		n = (i >> 1)+1;
 		if (i & 1)
 			n = -n;
+		*lines = n;
 		return try;
 	}
 
@@ -1155,6 +1169,33 @@ static int find_offset(const char *buf, 
 	 * We should start searching forward and backward.
 	 */
 	return -1;
+}
+
+static void remove_first_line(const char **rbuf, int *rsize)
+{
+	const char *buf = *rbuf;
+	int size = *rsize;
+	unsigned long offset;
+	offset = 0;
+	while (offset <= size) {
+		if (buf[offset++] == '\n')
+			break;
+	}
+	*rsize = size - offset;
+	*rbuf = buf + offset;
+}
+
+static void remove_last_line(const char **rbuf, int *rsize)
+{
+	const char *buf = *rbuf;
+	int size = *rsize;
+	unsigned long offset;
+	offset = size - 1;
+	while (offset > 0) {
+		if (buf[--offset] == '\n')
+			break;
+	}
+	*rsize = offset + 1;
 }
 
 struct buffer_desc {
@@ -1192,7 +1233,10 @@ static int apply_one_fragment(struct buf
 	int offset, size = frag->size;
 	char *old = xmalloc(size);
 	char *new = xmalloc(size);
+	const char *oldlines, *newlines;
 	int oldsize = 0, newsize = 0;
+	unsigned long leading, trailing;
+	int pos, lines;
 
 	while (size > 0) {
 		int len = linelen(patch, size);
@@ -1241,23 +1285,59 @@ #ifdef NO_ACCURATE_DIFF
 		newsize--;
 	}
 #endif
+
+	oldlines = old;
+	newlines = new;
+	leading = frag->leading;
+	trailing = frag->trailing;
+	lines = 0;
+	pos = frag->newpos;
+	for (;;) {
+		offset = find_offset(buf, desc->size, oldlines, oldsize, pos, &lines);
+		if (offset >= 0) {
+			int diff = newsize - oldsize;
+			unsigned long size = desc->size + diff;
+			unsigned long alloc = desc->alloc;
+
+			/* Warn if it was necessary to reduce the number 
+			 * of context lines.
+			 */
+			if ((leading != frag->leading) || (trailing != frag->trailing))
+				fprintf(stderr, "Context reduced to (%ld/%ld) to apply fragment at %d\n",
+					leading, trailing, pos + lines);
+
+			if (size > alloc) {
+				alloc = size + 8192;
+				desc->alloc = alloc;
+				buf = xrealloc(buf, alloc);
+				desc->buffer = buf;
+			}
+			desc->size = size;
+			memmove(buf + offset + newsize, buf + offset + oldsize, size - offset - newsize);
+			memcpy(buf + offset, newlines, newsize);
+			offset = 0;
 			
-	offset = find_offset(buf, desc->size, old, oldsize, frag->newpos);
-	if (offset >= 0) {
-		int diff = newsize - oldsize;
-		unsigned long size = desc->size + diff;
-		unsigned long alloc = desc->alloc;
-
-		if (size > alloc) {
-			alloc = size + 8192;
-			desc->alloc = alloc;
-			buf = xrealloc(buf, alloc);
-			desc->buffer = buf;
+			break;
 		}
-		desc->size = size;
-		memmove(buf + offset + newsize, buf + offset + oldsize, size - offset - newsize);
-		memcpy(buf + offset, new, newsize);
-		offset = 0;
+
+		/* Am I at my context limits? */
+		if ((leading <= p_context) && (trailing <= p_context))
+			break;
+		/* Reduce the number of context lines
+		 * Reduce both leading and trailing if they are equal
+		 * otherwise just reduce the larger context.
+		 */
+		if (leading >= trailing) {
+			remove_first_line(&oldlines, &oldsize);
+			remove_first_line(&newlines, &newsize);
+			pos--;
+			leading--;
+		}
+		if (trailing > leading) {
+			remove_last_line(&oldlines, &oldsize);
+			remove_last_line(&newlines, &newsize);
+			trailing--;
+		}
 	}
 
 	free(old);
@@ -1882,6 +1962,7 @@ int main(int argc, char **argv)
 
 	for (i = 1; i < argc; i++) {
 		const char *arg = argv[i];
+		char *end;
 		int fd;
 
 		if (!strcmp(arg, "-")) {
@@ -1943,6 +2024,12 @@ int main(int argc, char **argv)
 		}
 		if (!strcmp(arg, "-z")) {
 			line_termination = 0;
+			continue;
+		}
+		if (!strncmp(arg, "-C", 2)) {
+			p_context = strtoul(arg + 2, &end, 0);
+			if (*end != '\0')
+				die("unrecognized context count '%s'", arg + 2);
 			continue;
 		}
 		if (!strncmp(arg, "--whitespace=", 13)) {
-- 
1.3-rc3.GIT

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement limited context matching in git-apply.
  2006-04-10  9:33   ` [PATCH] Implement limited context matching in git-apply Eric W. Biederman
@ 2006-04-10 15:25     ` Linus Torvalds
  2006-04-10 18:35       ` Eric W. Biederman
  2006-04-10 19:29     ` Junio C Hamano
  1 sibling, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2006-04-10 15:25 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Junio C Hamano, git

On Mon, 10 Apr 2006, Eric W. Biederman wrote:
> 
> If I just loop through all of Andrews patches in order
> and run git-apply --index -C1 I process the entire patchset
> in 1m53s or about 6 patches per second.  So running
> git-mailinfo, git-write-tree, git-commit-tree, and
> git-update-ref everytime has a measurable impact,
> and shows things can be speeded up even more.

git-write-tree is actually a fairly expensive operation on the kernel. It 
needs to write the 1000+ tree objects - and while _most_ of them already 
exist (and thus don't actually need to be written out), we need to 
generate the tree object and its SHA1 in order to notice that that is the 
case.

I'm almost certain that 90%+ of the overhead you see is the tree writing, 
not the rest of the scripting.

Your patch looks ok from a quick read-through:

Acked-by: Linus Torvalds <torvalds@osdl.org>

		Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement limited context matching in git-apply.
  2006-04-10 15:25     ` Linus Torvalds
@ 2006-04-10 18:35       ` Eric W. Biederman
  2006-04-11 18:23         ` Linus Torvalds
  0 siblings, 1 reply; 8+ messages in thread
From: Eric W. Biederman @ 2006-04-10 18:35 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Junio C Hamano, git

Linus Torvalds <torvalds@osdl.org> writes:

> On Mon, 10 Apr 2006, Eric W. Biederman wrote:
>> 
>> If I just loop through all of Andrews patches in order
>> and run git-apply --index -C1 I process the entire patchset
>> in 1m53s or about 6 patches per second.  So running
>> git-mailinfo, git-write-tree, git-commit-tree, and
>> git-update-ref everytime has a measurable impact,
>> and shows things can be speeded up even more.
>
> git-write-tree is actually a fairly expensive operation on the kernel. It 
> needs to write the 1000+ tree objects - and while _most_ of them already 
> exist (and thus don't actually need to be written out), we need to 
> generate the tree object and its SHA1 in order to notice that that is the 
> case.
>
> I'm almost certain that 90%+ of the overhead you see is the tree writing, 
> not the rest of the scripting.

Well it is easy enough to time.  Looking at the timings
going from just git-apply to git-apply && git-write-tree
does seem to about the double the amount of time taken,
or take me to about 4 minutes.  With everything else
in there things happen in the 6-7 minute range with
in the hot cache scenario.  So write-tree is closer
to 50% of the overhead.

Is it possible to cache the sha1 of unmodified directories?

If we did that we could probe to see if the hash already
existed before we attempted to look for the subdirectories.

The pain would is remembering which directory sha1 are
current.  If nothing else we can modify: 
remove_cache_entry, and add_file_to_cache to clear
the parent directories cached sha1 when we update an
index entry.  But I keep thinking there should
be something more elegant.  Like using ce_flags,
or comparing mtime values.

...

Ok taking a quick look at write-tree to see where
the bottle neck is:  

I made two modified versions of write-tree. 
- git-write-tree-nowritetree which calls return just before calling
    write_tree.
- git-write-tree-nosha1write which does everything except call
    sha1_file_write.

With just git-apply and git-write-tree-nosha1write it takes
me about 3m:20s to process 2.6.17-rc1-mm2.

With just git-apply and git-write-tree-nowritetree it takes:
real    2m59.985s
user    1m38.353s
sys     0m31.445s

With just git-apply and /bin/true it takes:
real    2m1.581s
user    1m3.169s
sys     0m29.903s

Looking at the individual numbers:
$ time git-write-tree-nowritetree --missing-ok

real    0m0.158s
user    0m0.052s
sys     0m0.008s
$ time git-write-tree-nowritetree --missing-ok

real    0m0.155s
user    0m0.057s
sys     0m0.003s
$ time git-write-tree-nowritetree --missing-ok 
real    0m0.065s
user    0m0.057s
sys     0m0.002s
$ time git-write-tree-nowritetree --missing-ok

real    0m0.159s
user    0m0.055s
sys     0m0.005s
$ time git-write-tree-nowritetree --missing-ok

real    0m0.151s
user    0m0.054s
sys     0m0.007s
$ time git-write-tree-nowritetree --missing-ok

real    0m0.154s
user    0m0.056s
sys     0m0.005s

$ time git-write-tree-nosha1write --missing-ok
0000000000000000000000000000000000000000

real    0m0.199s
user    0m0.091s
sys     0m0.008s
$ time git-write-tree-nosha1write --missing-ok
0000000000000000000000000000000000000000

real    0m0.195s
user    0m0.094s
sys     0m0.007s
$ time git-write-tree-nosha1write --missing-ok
0000000000000000000000000000000000000000

real    0m0.198s
user    0m0.092s
sys     0m0.009s

$ time git-write-tree --missing-ok
0ecfe3dbc2e65aa9638c62abf0cf05057c77f884

real    0m0.217s
user    0m0.113s
sys     0m0.012s

$ time git-write-tree
0ecfe3dbc2e65aa9638c62abf0cf05057c77f884

real    0m0.276s
user    0m0.169s
sys     0m0.008s

So at a quick inspection it looks to me like:
About .059s to perform to check for missing files.
About .019s to write the new tree.
About .155s in start up overhead, read_cache, and sanity checks.

So at a first glance it looks like librification to
allow the redundant work to be skipped, is where
the big speed win on my machine would be.

> Your patch looks ok from a quick read-through:

Thanks.

My import of 2.6.17-rc1-mm2 gives exactly the same
result as simply applying Andrews patch.  Which while
not definitive hits a lot of interesting cases.

> Acked-by: Linus Torvalds <torvalds@osdl.org>
>
> 		Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement limited context matching in git-apply.
  2006-04-10  9:33   ` [PATCH] Implement limited context matching in git-apply Eric W. Biederman
  2006-04-10 15:25     ` Linus Torvalds
@ 2006-04-10 19:29     ` Junio C Hamano
  1 sibling, 0 replies; 8+ messages in thread
From: Junio C Hamano @ 2006-04-10 19:29 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: git

ebiederm@xmission.com (Eric W. Biederman) writes:

> If I just loop through all of Andrews patches in order
> and run git-apply --index -C1 I process the entire patchset
> in 1m53s or about 6 patches per second.  So running
> git-mailinfo, git-write-tree, git-commit-tree, and
> git-update-ref everytime has a measurable impact,
> and shows things can be speeded up even more.

Although I haven't "read" it, but just only "looked at" it, the
patch looks OK.  I haven't managed to start beating on it yet
for time constraints.

If you are dealing with the kernel tree, I suspect most time is
spent on write-tree.  Statistically, a typical kernel patch (I
haven't counted the ones in -mm series, but only the ones
actually reacheable from Linus tip) touches only 3 files on
average, so most of the 1,100 tree objects in a typical kernel
tree are computed but found unchanged when write-tree happens.

I suspect we could make a backward incompatible change to the
index file format to record the top-level tree object names
somewhere where normal cache-entry walker would not see.  Then
when anybody makes a modification to invalidate that tree
object, mark that tree (or split that tree to read lower level
trees lazily) to force us recompute the tree object.

Theoretically you could do that recursively to record all 1,100
tree objects but that would make the cache slightly larger (say,
by 100kB).

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement limited context matching in git-apply.
  2006-04-10 18:35       ` Eric W. Biederman
@ 2006-04-11 18:23         ` Linus Torvalds
  2006-04-13 12:02           ` Eric W. Biederman
  0 siblings, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2006-04-11 18:23 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Junio C Hamano, git

On Mon, 10 Apr 2006, Eric W. Biederman wrote:
> 
> So at a quick inspection it looks to me like:
> About .059s to perform to check for missing files.
> About .019s to write the new tree.
> About .155s in start up overhead, read_cache, and sanity checks.
> 
> So at a first glance it looks like librification to
> allow the redundant work to be skipped, is where
> the big speed win on my machine would be.

That sounded wrong to me, so I did a stupid patch to datestamp the 
different phases of git-write-tree, and here's what it says for me:

     0.000479 setup_git_directory
     0.008333 read_cache
     0.000813 ce_stage check
     0.001838 tree validity check
     0.037233 write_tree itself

	real    0m0.051s
	user    0m0.044s
	sys     0m0.008s

all times are in seconds. 

There is some overhead from the actual process startup (the timestamp 
numbers add up to 0.048696 seconds, which is less than the 0.051 reported 
by "time" - since I didn't datestamp everything), but the biggest chunk by 
far (about three quarters of the total time, including _all_ the setup 
like executing the process) is the actual call to write_tree() itself.

So it probably wouldn't actually be that big a win performance-wise to 
make write_tree() a library and call it directly from git-apply with some 
flag.

To really speed up write-tree, you'd have to know which trees to write, 
and just skip the rest (and know what SHA1's the ones you skipped had: 
it's not enough to just skip them, since you need the SHA1's of even the 
trees you skipped to write the parent tree, and you _will_ change at 
least the top parent tree if you had a valid patch).

Which would imply pretty major surgery - you'd have to add the tree entry 
information to the index file, and make sure they got invalidated properly 
(all the way to the root) whenever adding/deleting/updating a path in the 
index file.

Quite frankly, I don't think it's really worth it.

Yes, it would speed up applying of huge patch-sets, but it's not like 
we're really slow at that even now, and I suspect you'd be better off 
trying to either live with it, or trying to see if you could change your 
workflow. There clearly _are_ tools that are better at handling pure 
patches, with quilt being the obvious example.

I routinely apply 100-200 patches in a go, and that's fast enough to not 
even be an issue. Yes, I have reasonably fast hardware, but we're likely 
talking thousands of patches in a series for it to be _really_ painful 
even on pretty basic developer hardware. Even a slow machine should do a 
few hundred patches in a couple of minutes.

Maybe enough time to get a cup of coffee, but no more than it would take 
to compile the project.

			Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Implement limited context matching in git-apply.
  2006-04-11 18:23         ` Linus Torvalds
@ 2006-04-13 12:02           ` Eric W. Biederman
  0 siblings, 0 replies; 8+ messages in thread
From: Eric W. Biederman @ 2006-04-13 12:02 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Junio C Hamano, git

Linus Torvalds <torvalds@osdl.org> writes:

> On Mon, 10 Apr 2006, Eric W. Biederman wrote:
>> 
>> So at a quick inspection it looks to me like:
>> About .059s to perform to check for missing files.
>> About .019s to write the new tree.
>> About .155s in start up overhead, read_cache, and sanity checks.
>> 
>> So at a first glance it looks like librification to
>> allow the redundant work to be skipped, is where
>> the big speed win on my machine would be.
>
> That sounded wrong to me, so I did a stupid patch to datestamp the 
> different phases of git-write-tree, and here's what it says for me:
>
>      0.000479 setup_git_directory
>      0.008333 read_cache
>      0.000813 ce_stage check
>      0.001838 tree validity check
>      0.037233 write_tree itself
>
> 	real    0m0.051s
> 	user    0m0.044s
> 	sys     0m0.008s
>
> all times are in seconds. 

Ok.  This is interesting and probably reveals what is different
about my setup.  For you user+sys = real.  For me there was
a significant gap.  So it looks like for some reason I was not
succeeding in keeping .git/index hot in the page cache.

When you are I/O bound it does make sense for read_cache
to be the dominate time.  I just need to track what is up
with my machine that makes me I/O bound.  Having too little
ram is an obvious candidate but it is too simple.  Currently
out of 512M I only have 21M in the page cache which sounds
really low.  Something for me to look at.

> Which would imply pretty major surgery - you'd have to add the tree entry 
> information to the index file, and make sure they got invalidated properly 
> (all the way to the root) whenever adding/deleting/updating a path in the 
> index file.
>
> Quite frankly, I don't think it's really worth it.

For the current size of the kernel tree I agree.

It is a potential scaling limitation and if someone starts
tracking really big tress with git it may be worth revisiting.

> Yes, it would speed up applying of huge patch-sets, but it's not like 
> we're really slow at that even now, and I suspect you'd be better off 
> trying to either live with it, or trying to see if you could change your 
> workflow. There clearly _are_ tools that are better at handling pure 
> patches, with quilt being the obvious example.

Probably.  For my workflow not having to switch tool chains is
the biggest win.  Which is part of what the -C is about.

> I routinely apply 100-200 patches in a go, and that's fast enough to not 
> even be an issue. Yes, I have reasonably fast hardware, but we're likely 
> talking thousands of patches in a series for it to be _really_ painful 
> even on pretty basic developer hardware. Even a slow machine should do a 
> few hundred patches in a couple of minutes.
>
> Maybe enough time to get a cup of coffee, but no more than it would take 
> to compile the project.

Agreed.  I did the analysis so I could understand what was going on.
If the analysis revealed low hanging fruit I would have plucked it.

Eric

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2006-04-13 12:04 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-04-10  2:41 [PATCH] Implement --fuzz= option for git-apply Eric W. Biederman
2006-04-10  5:52 ` Eric W. Biederman
2006-04-10  9:33   ` [PATCH] Implement limited context matching in git-apply Eric W. Biederman
2006-04-10 15:25     ` Linus Torvalds
2006-04-10 18:35       ` Eric W. Biederman
2006-04-11 18:23         ` Linus Torvalds
2006-04-13 12:02           ` Eric W. Biederman
2006-04-10 19:29     ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox