[PATCH 0/4] Pickaxe search clean-up and optimization

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/4] Pickaxe search clean-up and optimization
@ 2009-02-26  6:52 Junio C Hamano
  2009-02-26  6:52 ` [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe() Junio C Hamano
                   ` (4 more replies)
  0 siblings, 5 replies; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  6:52 UTC (permalink / raw)
  To: git

This little series refactors the diffcore-pickaxe a bit and then optimizes
its search for "needle" inside the haystacks.

Junio C Hamano (4):
  diffcore-pickaxe: refactor diffcore_pickaxe()
  diffcore-pickaxe: micro-optimize has_match() function
  diffcore-pickaxe: further refactor count_match()
  diffcore-pickaxe: optimize by trimming common initial and trailing
    parts

 diffcore-pickaxe.c |  176 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 118 insertions(+), 58 deletions(-)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe()
  2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
@ 2009-02-26  6:52 ` Junio C Hamano
  2009-02-27 23:58   ` René Scharfe
  2009-02-26  6:52 ` [PATCH 2/4] diffcore-pickaxe: micro-optimize has_match() function Junio C Hamano
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  6:52 UTC (permalink / raw)
  To: git

Introduce pickaxe_match_one() to remove duplicated code that decides if a
filepair is "interesting enough" for the purpose of pickaxe from the main
loop of the function.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-pickaxe.c |   83 ++++++++++++++++++++++++++-------------------------
 1 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index af9fffe..d27e725 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -5,9 +5,9 @@
 #include "diff.h"
 #include "diffcore.h"
 
-static unsigned int contains(struct diff_filespec *one,
-			     const char *needle, unsigned long len,
-			     regex_t *regexp)
+static unsigned int count_match(struct diff_filespec *one,
+				const char *needle, unsigned long len,
+				regex_t *regexp)
 {
 	unsigned int cnt;
 	unsigned long offset, sz;
@@ -48,6 +48,38 @@ static unsigned int contains(struct diff_filespec *one,
 	return cnt;
 }
 
+static int has_match(struct diff_filespec *one,
+		     const char *needle, unsigned long len,
+		     regex_t *regexp)
+{
+	return !!count_match(one, needle, len, regexp);
+}
+
+static int has_different_matches(struct diff_filepair *p,
+				 const char *needle, unsigned long len,
+				 regex_t *regexp)
+{
+	return (count_match(p->one, needle, len, regexp)
+		!= count_match(p->two, needle, len, regexp));
+
+}
+
+static int pickaxe_match_one(struct diff_filepair *p,
+			     const char *needle, unsigned long len,
+			     regex_t *regexp)
+{
+	if (!DIFF_FILE_VALID(p->one)) {
+		if (!DIFF_FILE_VALID(p->two))
+			return 0;
+		return has_match(p->two, needle, len, regexp);
+	} else if (!DIFF_FILE_VALID(p->two))
+		return has_match(p->one, needle, len, regexp);
+	else if (diff_unmodified_pair(p))
+		return 0;
+	else
+		return has_different_matches(p, needle, len, regexp);
+}
+
 void diffcore_pickaxe(const char *needle, int opts)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
@@ -75,29 +107,14 @@ void diffcore_pickaxe(const char *needle, int opts)
 		/* Showing the whole changeset if needle exists */
 		for (i = has_changes = 0; !has_changes && i < q->nr; i++) {
 			struct diff_filepair *p = q->queue[i];
-			if (!DIFF_FILE_VALID(p->one)) {
-				if (!DIFF_FILE_VALID(p->two))
-					continue; /* ignore unmerged */
-				/* created */
-				if (contains(p->two, needle, len, regexp))
-					has_changes++;
-			}
-			else if (!DIFF_FILE_VALID(p->two)) {
-				if (contains(p->one, needle, len, regexp))
-					has_changes++;
-			}
-			else if (!diff_unmodified_pair(p) &&
-				 contains(p->one, needle, len, regexp) !=
-				 contains(p->two, needle, len, regexp))
-				has_changes++;
+			if (pickaxe_match_one(p, needle, len, regexp))
+				return; /* not munge the queue */
 		}
-		if (has_changes)
-			return; /* not munge the queue */
 
-		/* otherwise we will clear the whole queue
-		 * by copying the empty outq at the end of this
-		 * function, but first clear the current entries
-		 * in the queue.
+		/*
+		 * otherwise we will clear the whole queue by copying
+		 * the empty outq at the end of this function, but
+		 * first clear the current entries in the queue.
 		 */
 		for (i = 0; i < q->nr; i++)
 			diff_free_filepair(q->queue[i]);
@@ -106,24 +123,8 @@ void diffcore_pickaxe(const char *needle, int opts)
 		/* Showing only the filepairs that has the needle */
 		for (i = 0; i < q->nr; i++) {
 			struct diff_filepair *p = q->queue[i];
-			has_changes = 0;
-			if (!DIFF_FILE_VALID(p->one)) {
-				if (!DIFF_FILE_VALID(p->two))
-					; /* ignore unmerged */
-				/* created */
-				else if (contains(p->two, needle, len, regexp))
-					has_changes = 1;
-			}
-			else if (!DIFF_FILE_VALID(p->two)) {
-				if (contains(p->one, needle, len, regexp))
-					has_changes = 1;
-			}
-			else if (!diff_unmodified_pair(p) &&
-				 contains(p->one, needle, len, regexp) !=
-				 contains(p->two, needle, len, regexp))
-				has_changes = 1;
 
-			if (has_changes)
+			if (pickaxe_match_one(p, needle, len, regexp))
 				diff_q(&outq, p);
 			else
 				diff_free_filepair(p);
-- 
1.6.2.rc2.91.gf9a36

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe()
  2009-02-26  6:52 ` [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe() Junio C Hamano
@ 2009-02-27 23:58   ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2009-02-27 23:58 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano schrieb:
>  void diffcore_pickaxe(const char *needle, int opts)
>  {
>  	struct diff_queue_struct *q = &diff_queued_diff;
> @@ -75,29 +107,14 @@ void diffcore_pickaxe(const char *needle, int opts)
>  		/* Showing the whole changeset if needle exists */
>  		for (i = has_changes = 0; !has_changes && i < q->nr; i++) {
>  			struct diff_filepair *p = q->queue[i];
> -			if (!DIFF_FILE_VALID(p->one)) {
> -				if (!DIFF_FILE_VALID(p->two))
> -					continue; /* ignore unmerged */
> -				/* created */
> -				if (contains(p->two, needle, len, regexp))
> -					has_changes++;
> -			}
> -			else if (!DIFF_FILE_VALID(p->two)) {
> -				if (contains(p->one, needle, len, regexp))
> -					has_changes++;
> -			}
> -			else if (!diff_unmodified_pair(p) &&
> -				 contains(p->one, needle, len, regexp) !=
> -				 contains(p->two, needle, len, regexp))
> -				has_changes++;
> +			if (pickaxe_match_one(p, needle, len, regexp))
> +				return; /* not munge the queue */
>  		}
> -		if (has_changes)
> -			return; /* not munge the queue */
>  
> -		/* otherwise we will clear the whole queue
> -		 * by copying the empty outq at the end of this
> -		 * function, but first clear the current entries
> -		 * in the queue.
> +		/*
> +		 * otherwise we will clear the whole queue by copying
> +		 * the empty outq at the end of this function, but
> +		 * first clear the current entries in the queue.
>  		 */
>  		for (i = 0; i < q->nr; i++)
>  			diff_free_filepair(q->queue[i]);
> @@ -106,24 +123,8 @@ void diffcore_pickaxe(const char *needle, int opts)
>  		/* Showing only the filepairs that has the needle */
>  		for (i = 0; i < q->nr; i++) {
>  			struct diff_filepair *p = q->queue[i];
> -			has_changes = 0;
> -			if (!DIFF_FILE_VALID(p->one)) {
> -				if (!DIFF_FILE_VALID(p->two))
> -					; /* ignore unmerged */
> -				/* created */
> -				else if (contains(p->two, needle, len, regexp))
> -					has_changes = 1;
> -			}
> -			else if (!DIFF_FILE_VALID(p->two)) {
> -				if (contains(p->one, needle, len, regexp))
> -					has_changes = 1;
> -			}
> -			else if (!diff_unmodified_pair(p) &&
> -				 contains(p->one, needle, len, regexp) !=
> -				 contains(p->two, needle, len, regexp))
> -				has_changes = 1;
>  
> -			if (has_changes)
> +			if (pickaxe_match_one(p, needle, len, regexp))
>  				diff_q(&outq, p);
>  			else
>  				diff_free_filepair(p);

You might want to squash this in, because has_changes is now constantly set
to zero:

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index d27e725..d294d29 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -84,7 +84,7 @@ void diffcore_pickaxe(const char *needle, int opts)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	unsigned long len = strlen(needle);
-	int i, has_changes;
+	int i;
 	regex_t regex, *regexp = NULL;
 	struct diff_queue_struct outq;
 	outq.queue = NULL;
@@ -105,7 +105,7 @@ void diffcore_pickaxe(const char *needle, int opts)
 
 	if (opts & DIFF_PICKAXE_ALL) {
 		/* Showing the whole changeset if needle exists */
-		for (i = has_changes = 0; !has_changes && i < q->nr; i++) {
+		for (i = 0; i < q->nr; i++) {
 			struct diff_filepair *p = q->queue[i];
 			if (pickaxe_match_one(p, needle, len, regexp))
 				return; /* not munge the queue */

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 2/4] diffcore-pickaxe: micro-optimize has_match() function
  2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
  2009-02-26  6:52 ` [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe() Junio C Hamano
@ 2009-02-26  6:52 ` Junio C Hamano
  2009-02-26  6:52 ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  6:52 UTC (permalink / raw)
  To: git

When we are looking at an event that creates (or removes) a path, we only
need to see if the created (or removed) blob has an instance of the needle
we are looking for.  There is no need to count, once we know there is one.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-pickaxe.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index d27e725..007b39c 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -5,7 +5,8 @@
 #include "diff.h"
 #include "diffcore.h"
 
-static unsigned int count_match(struct diff_filespec *one,
+static unsigned int count_match(int one_or_more,
+				struct diff_filespec *one,
 				const char *needle, unsigned long len,
 				regex_t *regexp)
 {
@@ -30,6 +31,8 @@ static unsigned int count_match(struct diff_filespec *one,
 			data += regmatch.rm_so;
 			if (*data) data++;
 			cnt++;
+			if (one_or_more)
+				break;
 		}
 
 	} else { /* Classic exact string match */
@@ -41,6 +44,8 @@ static unsigned int count_match(struct diff_filespec *one,
 			if (!memcmp(needle, data + offset, len)) {
 				offset += len - 1;
 				cnt++;
+				if (one_or_more)
+					break;
 			}
 		}
 	}
@@ -52,15 +57,15 @@ static int has_match(struct diff_filespec *one,
 		     const char *needle, unsigned long len,
 		     regex_t *regexp)
 {
-	return !!count_match(one, needle, len, regexp);
+	return count_match(1, one, needle, len, regexp);
 }
 
 static int has_different_matches(struct diff_filepair *p,
 				 const char *needle, unsigned long len,
 				 regex_t *regexp)
 {
-	return (count_match(p->one, needle, len, regexp)
-		!= count_match(p->two, needle, len, regexp));
+	return (count_match(0, p->one, needle, len, regexp)
+		!= count_match(0, p->two, needle, len, regexp));
 
 }
 
-- 
1.6.2.rc2.91.gf9a36

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
  2009-02-26  6:52 ` [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe() Junio C Hamano
  2009-02-26  6:52 ` [PATCH 2/4] diffcore-pickaxe: micro-optimize has_match() function Junio C Hamano
@ 2009-02-26  6:52 ` Junio C Hamano
  2009-02-26  7:23   ` Kjetil Barvik
  2009-02-28  1:13   ` René Scharfe
  2009-02-26  6:52 ` [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts Junio C Hamano
  2009-03-02 23:00 ` [PATCH 1/2] diffcore-pickaxe: use memmem() René Scharfe
  4 siblings, 2 replies; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  6:52 UTC (permalink / raw)
  To: git

We separate out the part that prepares the blob data to be inspected and
the part that does the actual counting in the data.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-pickaxe.c |   46 +++++++++++++++++++++++++---------------------
 1 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index 007b39c..4a0dca1 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -5,22 +5,13 @@
 #include "diff.h"
 #include "diffcore.h"
 
-static unsigned int count_match(int one_or_more,
-				struct diff_filespec *one,
-				const char *needle, unsigned long len,
-				regex_t *regexp)
+static unsigned int count_match_internal(int one_or_more,
+					 const char *data, unsigned long sz,
+					 const char *needle, unsigned long len,
+					 regex_t *regexp)
 {
-	unsigned int cnt;
-	unsigned long offset, sz;
-	const char *data;
-	if (diff_populate_filespec(one, 0))
-		return 0;
-	if (!len)
-		return 0;
-
-	sz = one->size;
-	data = one->data;
-	cnt = 0;
+	unsigned int cnt = 0;
+	unsigned long offset;
 
 	if (regexp) {
 		regmatch_t regmatch;
@@ -29,16 +20,14 @@ static unsigned int count_match(int one_or_more,
 		while (*data && !regexec(regexp, data, 1, &regmatch, flags)) {
 			flags |= REG_NOTBOL;
 			data += regmatch.rm_so;
-			if (*data) data++;
+			if (*data)
+				data++;
 			cnt++;
 			if (one_or_more)
 				break;
 		}
-
-	} else { /* Classic exact string match */
-		/* Yes, I've heard of strstr(), but the thing is *data may
-		 * not be NUL terminated.  Sue me.
-		 */
+	} else {
+		/* data many not be NUL terminated; we cannot use strstr() */
 		for (offset = 0; offset + len <= sz; offset++) {
 			/* we count non-overlapping occurrences of needle */
 			if (!memcmp(needle, data + offset, len)) {
@@ -49,6 +38,21 @@ static unsigned int count_match(int one_or_more,
 			}
 		}
 	}
+	return cnt;
+}
+
+static unsigned int count_match(int one_or_more,
+				struct diff_filespec *one,
+				const char *needle, unsigned long len,
+				regex_t *regexp)
+{
+	unsigned int cnt;
+	if (diff_populate_filespec(one, 0))
+		return 0;
+	if (!len)
+		return 0;
+	cnt = count_match_internal(one_or_more, one->data, one->size,
+				   needle, len, regexp);
 	diff_free_filespec_data(one);
 	return cnt;
 }
-- 
1.6.2.rc2.91.gf9a36

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-26  6:52 ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
@ 2009-02-26  7:23   ` Kjetil Barvik
  2009-02-28  1:13   ` René Scharfe
  1 sibling, 0 replies; 23+ messages in thread
From: Kjetil Barvik @ 2009-02-26  7:23 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <gitster@pobox.com> writes:

> -
> -	} else { /* Classic exact string match */
> -		/* Yes, I've heard of strstr(), but the thing is *data may
> -		 * not be NUL terminated.  Sue me.
> -		 */
> +	} else {
> +		/* data many not be NUL terminated; we cannot use strstr() */

  spellfix: s/many/may/

  -- kjetil

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-26  6:52 ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
  2009-02-26  7:23   ` Kjetil Barvik
@ 2009-02-28  1:13   ` René Scharfe
  2009-02-28  1:25     ` Junio C Hamano
  1 sibling, 1 reply; 23+ messages in thread
From: René Scharfe @ 2009-02-28  1:13 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano schrieb:
> -static unsigned int count_match(int one_or_more,
> -				struct diff_filespec *one,
> -				const char *needle, unsigned long len,
> -				regex_t *regexp)
> +static unsigned int count_match_internal(int one_or_more,
> +					 const char *data, unsigned long sz,
> +					 const char *needle, unsigned long len,
> +					 regex_t *regexp)
>  {

I don't especially like flags like one_or_more.  Having two functions
(which possibly call a common function that does most of the work) is
nicer.  And it's a bit sad here because there already are two functions
with nice names: count_match() and has_match().  But, well, since the
functions are not exported anyway it doesn't matter much.

> @@ -29,16 +20,14 @@ static unsigned int count_match(int one_or_more,
>  		while (*data && !regexec(regexp, data, 1, &regmatch, flags)) {
>  			flags |= REG_NOTBOL;
>  			data += regmatch.rm_so;
> -			if (*data) data++;
> +			if (*data)
> +				data++;
>  			cnt++;
>  			if (one_or_more)
>  				break;
>  		}
> -
> -	} else { /* Classic exact string match */
> -		/* Yes, I've heard of strstr(), but the thing is *data may
> -		 * not be NUL terminated.  Sue me.
> -		 */
> +	} else {
> +		/* data many not be NUL terminated; we cannot use strstr() */

That looks fishy to me.  regexec() expects data to be a NUL-terminated
string, so either the comment is wrong or the regexp case needs to take
better care to add a NUL at the end of the buffer.

In any case, there is also memmem(), which uses the same fast algorithm
as strstr() in recent glibc versions.  Like this?


diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index f4870b4..4c19967 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -11,7 +11,6 @@ static unsigned int count_match_internal(int one_or_more,
 					 regex_t *regexp)
 {
 	unsigned int cnt = 0;
-	unsigned long offset;
 
 	if (regexp) {
 		regmatch_t regmatch;
@@ -27,15 +26,15 @@ static unsigned int count_match_internal(int one_or_more,
 				break;
 		}
 	} else {
-		/* data many not be NUL terminated; we cannot use strstr() */
-		for (offset = 0; offset + len <= sz; offset++) {
-			/* we count non-overlapping occurrences of needle */
-			if (!memcmp(needle, data + offset, len)) {
-				offset += len - 1;
-				cnt++;
-				if (one_or_more)
-					break;
-			}
+		while (sz) {
+			const char *found = memmem(data, sz, needle, len);
+			if (!found)
+				break;
+			sz -= found - data + len;
+			data = found + len;
+			cnt++;
+			if (one_or_more)
+				break;
 		}
 	}
 	return cnt;

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28  1:13   ` René Scharfe
@ 2009-02-28  1:25     ` Junio C Hamano
  2009-02-28  6:08       ` Junio C Hamano
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2009-02-28  1:25 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:

>> -
>> -	} else { /* Classic exact string match */
>> -		/* Yes, I've heard of strstr(), but the thing is *data may
>> -		 * not be NUL terminated.  Sue me.
>> -		 */
>> +	} else {
>> +		/* data many not be NUL terminated; we cannot use strstr() */
>
> That looks fishy to me.  regexec() expects data to be a NUL-terminated
> string, so either the comment is wrong or the regexp case needs to take
> better care to add a NUL at the end of the buffer.

Probably yes, but regexp side is not my code and I never use it, so... ;-)

> In any case, there is also memmem(), which uses the same fast algorithm
> as strstr() in recent glibc versions.  Like this?

Thanks; it would be nice to bench this change.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28  1:25     ` Junio C Hamano
@ 2009-02-28  6:08       ` Junio C Hamano
  2009-02-28 13:10         ` René Scharfe
  0 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2009-02-28  6:08 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

Junio C Hamano <gitster@pobox.com> writes:

> René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> ...
>> In any case, there is also memmem(), which uses the same fast algorithm
>> as strstr() in recent glibc versions.  Like this?
>
> Thanks; it would be nice to bench this change.

With memmem() patch applied on top of [1-4/4], the same test as described
in the commit log message of [4/4] which was:

    $ STRING='Ensure that the real time constraints are schedulable.'
    $ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null

    (Before the patch, best of 5 runs)
    5.59user 0.15system 0:05.74elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
    0inputs+0outputs (0major+39956minor)pagefaults 0swaps

    (After the patch, best of 5 runs)
    3.04user 0.17system 0:03.23elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
    0inputs+0outputs (0major+49697minor)pagefaults 0swaps

    The file "kernel/sched.c" has roughly 900 changes applied to it, and over
    its lifetime, it has grown from 5kB to 9kB in size.  I'd expect a larger
    file may see a bigger performance boost.

(With memmem() patch, best of 5 runs)
2.46user 0.15system 0:02.62elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+49698minor)pagefaults 0swaps

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28  6:08       ` Junio C Hamano
@ 2009-02-28 13:10         ` René Scharfe
  2009-02-28 17:40           ` Junio C Hamano
  2009-03-01  7:31           ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
  0 siblings, 2 replies; 23+ messages in thread
From: René Scharfe @ 2009-02-28 13:10 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano schrieb:
> Junio C Hamano <gitster@pobox.com> writes:
> 
>> René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
>> ...
>>> In any case, there is also memmem(), which uses the same fast algorithm
>>> as strstr() in recent glibc versions.  Like this?
>> Thanks; it would be nice to bench this change.
> 
> With memmem() patch applied on top of [1-4/4], the same test as described
> in the commit log message of [4/4] which was:
> 
>     $ STRING='Ensure that the real time constraints are schedulable.'
>     $ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null
> 
>     (Before the patch, best of 5 runs)
>     5.59user 0.15system 0:05.74elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
>     0inputs+0outputs (0major+39956minor)pagefaults 0swaps
> 
>     (After the patch, best of 5 runs)
>     3.04user 0.17system 0:03.23elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
>     0inputs+0outputs (0major+49697minor)pagefaults 0swaps
> 
>     The file "kernel/sched.c" has roughly 900 changes applied to it, and over
>     its lifetime, it has grown from 5kB to 9kB in size.  I'd expect a larger
>     file may see a bigger performance boost.
> 
> (With memmem() patch, best of 5 runs)
> 2.46user 0.15system 0:02.62elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
> 0inputs+0outputs (0major+49698minor)pagefaults 0swaps

I get this (Ubuntu 8.10 x64, Fedora 10 x64 using the same Linux repo,
Windows Vista x64 using a different Linux repo with the same HEAD on
NTFS and msysgit, numbers are the elapsed time in seconds, best of five
runs):

                           Ubuntu  Fedora  Windows
   v1.6.2-rc2                8.14    8.16    9.236
   v1.6.2-rc2+[1-4]          2.43    2.45    2.995
   v1.6.2-rc2+[1-4]+memmem   1.31    1.25    2.917
   v1.6.2-rc2+[1-3]+memmem   1.51    1.16    8.455

Ubuntu has glibc 2.8, while Fedora 10 has glibc 2.9, with a new and more
efficient memmem() implementation.  On Windows, we use our own naive
memmem() implementation.

So using memmem() is worthwhile.  And providing a better fall-back
version in compat/ can speed up this particular case to the point where
the fourth patch becomes moot.

Hmm, gnulib (http://git.savannah.gnu.org/gitweb/?p=gnulib.git;a=summary)
contains all parts ready for copy & paste, licensed under the GPL 2 or
up.  That won't cause problems with the libgit2 relicensing effort, as
memmem()  won't end up in there, right?


For the record, here the raw timings used to make the table above (best
of five):

Fedora 10:
8.10user 0.05system 0:08.16elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+31943minor)pagefaults 0swaps

2.38user 0.06system 0:02.45elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+41294minor)pagefaults 0swaps

1.19user 0.05system 0:01.25elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+39274minor)pagefaults 0swaps

1.10user 0.05system 0:01.16elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+31796minor)pagefaults 0swaps

Ubuntu 8.10:
8.08user 0.05system 0:08.14elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+29579minor)pagefaults 0swaps

2.35user 0.07system 0:02.43elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+40437minor)pagefaults 0swaps

1.23user 0.08system 0:01.31elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+41097minor)pagefaults 0swaps

1.46user 0.05system 0:01.51elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+30533minor)pagefaults 0swaps

Windows:
real    0m9.236s
user    0m0.000s
sys     0m0.000s

real    0m2.995s
user    0m0.000s
sys     0m0.000s

real    0m2.917s
user    0m0.000s
sys     0m0.015s

real    0m8.455s
user    0m0.000s
sys     0m0.000s

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28 13:10         ` René Scharfe
@ 2009-02-28 17:40           ` Junio C Hamano
  2009-02-28 18:15             ` René Scharfe
  2009-02-28 19:16             ` [PATCH] import memmem() with linear complexity from Gnulib René Scharfe
  2009-03-01  7:31           ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
  1 sibling, 2 replies; 23+ messages in thread
From: Junio C Hamano @ 2009-02-28 17:40 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:

> I get this (Ubuntu 8.10 x64, Fedora 10 x64 using the same Linux repo,
> Windows Vista x64 using a different Linux repo with the same HEAD on
> NTFS and msysgit, numbers are the elapsed time in seconds, best of five
> runs):
>
>                            Ubuntu  Fedora  Windows
>    v1.6.2-rc2                8.14    8.16    9.236
>    v1.6.2-rc2+[1-4]          2.43    2.45    2.995
>    v1.6.2-rc2+[1-4]+memmem   1.31    1.25    2.917
>    v1.6.2-rc2+[1-3]+memmem   1.51    1.16    8.455
>
> Ubuntu has glibc 2.8, while Fedora 10 has glibc 2.9, with a new and more
> efficient memmem() implementation.  On Windows, we use our own naive
> memmem() implementation.

Yeah, what does glibc use these days?  Some variant of Boyer-Moore?

> So using memmem() is worthwhile.  And providing a better fall-back
> version in compat/ can speed up this particular case to the point where
> the fourth patch becomes moot.
>
> Hmm, gnulib (http://git.savannah.gnu.org/gitweb/?p=gnulib.git;a=summary)
> contains all parts ready for copy & paste, licensed under the GPL 2 or
> up.  That won't cause problems with the libgit2 relicensing effort, as
> memmem()  won't end up in there, right?

Correct.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28 17:40           ` Junio C Hamano
@ 2009-02-28 18:15             ` René Scharfe
  2009-02-28 19:16             ` [PATCH] import memmem() with linear complexity from Gnulib René Scharfe
  1 sibling, 0 replies; 23+ messages in thread
From: René Scharfe @ 2009-02-28 18:15 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano schrieb:
> René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> 
>> I get this (Ubuntu 8.10 x64, Fedora 10 x64 using the same Linux repo,
>> Windows Vista x64 using a different Linux repo with the same HEAD on
>> NTFS and msysgit, numbers are the elapsed time in seconds, best of five
>> runs):
>>
>>                            Ubuntu  Fedora  Windows
>>    v1.6.2-rc2                8.14    8.16    9.236
>>    v1.6.2-rc2+[1-4]          2.43    2.45    2.995
>>    v1.6.2-rc2+[1-4]+memmem   1.31    1.25    2.917
>>    v1.6.2-rc2+[1-3]+memmem   1.51    1.16    8.455
>>
>> Ubuntu has glibc 2.8, while Fedora 10 has glibc 2.9, with a new and more
>> efficient memmem() implementation.  On Windows, we use our own naive
>> memmem() implementation.
> 
> Yeah, what does glibc use these days?  Some variant of Boyer-Moore?

No, the algorithm is called Two Way, which, unlike Boyer-Moore, only
needs constant space.  The implementation seems to originate from this bug:

	http://sourceware.org/bugzilla/show_bug.cgi?id=5514

And the algorithm is documented here:

	http://www-igm.univ-mlv.fr/~lecroq/string/node26.html

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] import memmem() with linear complexity from Gnulib
  2009-02-28 17:40           ` Junio C Hamano
  2009-02-28 18:15             ` René Scharfe
@ 2009-02-28 19:16             ` René Scharfe
  2009-02-28 22:44               ` Mike Hommey
  1 sibling, 1 reply; 23+ messages in thread
From: René Scharfe @ 2009-02-28 19:16 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Gnulib and glibc have gained a memmem() implementation using the Two-Way
algorithm, which needs constant space and linear time.  Import it to
compat/ in order to replace the simple quadratic implementation there.

memmem.c and str-two-way.h are copied verbatim from the repository at
git://git.savannah.gnu.org/gnulib.git, with the following changes to
memmem.c to make it fit into git's build environment:

	21,23c21
	< #ifndef _LIBC
	< # include <config.h>
	< #endif
	---
	> #include "../git-compat-util.h"
	40c38
	< memmem (const void *haystack_start, size_t haystack_len,
	---
	> gitmemmem(const void *haystack_start, size_t haystack_len,

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 Makefile             |    1 +
 compat/memmem.c      |  103 +++++++++----
 compat/str-two-way.h |  429 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 504 insertions(+), 29 deletions(-)
 rewrite compat/memmem.c (91%)
 create mode 100644 compat/str-two-way.h

diff --git a/Makefile b/Makefile
index 0675c43..b2b15d9 100644
--- a/Makefile
+++ b/Makefile
@@ -359,6 +359,7 @@ LIB_H += cache-tree.h
 LIB_H += commit.h
 LIB_H += compat/cygwin.h
 LIB_H += compat/mingw.h
+LIB_H += compat/str-two-way.h
 LIB_H += csum-file.h
 LIB_H += decorate.h
 LIB_H += delta.h
diff --git a/compat/memmem.c b/compat/memmem.c
dissimilarity index 91%
index cd0d877..b0b7821 100644
--- a/compat/memmem.c
+++ b/compat/memmem.c
@@ -1,29 +1,74 @@
-#include "../git-compat-util.h"
-
-void *gitmemmem(const void *haystack, size_t haystack_len,
-                const void *needle, size_t needle_len)
-{
-	const char *begin = haystack;
-	const char *last_possible = begin + haystack_len - needle_len;
-
-	/*
-	 * The first occurrence of the empty string is deemed to occur at
-	 * the beginning of the string.
-	 */
-	if (needle_len == 0)
-		return (void *)begin;
-
-	/*
-	 * Sanity check, otherwise the loop might search through the whole
-	 * memory.
-	 */
-	if (haystack_len < needle_len)
-		return NULL;
-
-	for (; begin <= last_possible; begin++) {
-		if (!memcmp(begin, needle, needle_len))
-			return (void *)begin;
-	}
-
-	return NULL;
-}
+/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2007,2008 Free Software
+   Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License along
+   with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* This particular implementation was written by Eric Blake, 2008.  */
+
+#include "../git-compat-util.h"
+
+/* Specification of memmem.  */
+#include <string.h>
+
+#ifndef _LIBC
+# define __builtin_expect(expr, val)   (expr)
+#endif
+
+#define RETURN_TYPE void *
+#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#include "str-two-way.h"
+
+/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
+   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
+   HAYSTACK.  */
+void *
+gitmemmem(const void *haystack_start, size_t haystack_len,
+	const void *needle_start, size_t needle_len)
+{
+  /* Abstract memory is considered to be an array of 'unsigned char' values,
+     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
+  const unsigned char *haystack = (const unsigned char *) haystack_start;
+  const unsigned char *needle = (const unsigned char *) needle_start;
+
+  if (needle_len == 0)
+    /* The first occurrence of the empty string is deemed to occur at
+       the beginning of the string.  */
+    return (void *) haystack;
+
+  /* Sanity check, otherwise the loop might search through the whole
+     memory.  */
+  if (__builtin_expect (haystack_len < needle_len, 0))
+    return NULL;
+
+  /* Use optimizations in memchr when possible, to reduce the search
+     size of haystack using a linear algorithm with a smaller
+     coefficient.  However, avoid memchr for long needles, since we
+     can often achieve sublinear performance.  */
+  if (needle_len < LONG_NEEDLE_THRESHOLD)
+    {
+      haystack = memchr (haystack, *needle, haystack_len);
+      if (!haystack || __builtin_expect (needle_len == 1, 0))
+	return (void *) haystack;
+      haystack_len -= haystack - (const unsigned char *) haystack_start;
+      if (haystack_len < needle_len)
+	return NULL;
+      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+    }
+  else
+    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+}
+
+#undef LONG_NEEDLE_THRESHOLD
diff --git a/compat/str-two-way.h b/compat/str-two-way.h
new file mode 100644
index 0000000..b0338a7
--- /dev/null
+++ b/compat/str-two-way.h
@@ -0,0 +1,429 @@
+/* Byte-wise substring search, using the Two-Way algorithm.
+   Copyright (C) 2008 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Eric Blake <ebb9@byu.net>, 2008.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License along
+   with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* Before including this file, you need to include <config.h> and
+   <string.h>, and define:
+     RESULT_TYPE             A macro that expands to the return type.
+     AVAILABLE(h, h_l, j, n_l)
+			     A macro that returns nonzero if there are
+			     at least N_L bytes left starting at H[J].
+			     H is 'unsigned char *', H_L, J, and N_L
+			     are 'size_t'; H_L is an lvalue.  For
+			     NUL-terminated searches, H_L can be
+			     modified each iteration to avoid having
+			     to compute the end of H up front.
+
+  For case-insensitivity, you may optionally define:
+     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
+			     characters of P1 and P2 are equal.
+     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
+			     it has been fetched from one of the two strings.
+			     The argument is an 'unsigned char'; the result
+			     must be an 'unsigned char' as well.
+
+  This file undefines the macros documented above, and defines
+  LONG_NEEDLE_THRESHOLD.
+*/
+
+#include <limits.h>
+#include <stdint.h>
+
+/* We use the Two-Way string matching algorithm, which guarantees
+   linear complexity with constant space.  Additionally, for long
+   needles, we also use a bad character shift table similar to the
+   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
+   performance.
+
+   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
+   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
+*/
+
+/* Point at which computing a bad-byte shift table is likely to be
+   worthwhile.  Small needles should not compute a table, since it
+   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
+   speedup no greater than a factor of NEEDLE_LEN.  The larger the
+   needle, the better the potential performance gain.  On the other
+   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
+   memory required for the table is prohibitive.  */
+#if CHAR_BIT < 10
+# define LONG_NEEDLE_THRESHOLD 32U
+#else
+# define LONG_NEEDLE_THRESHOLD SIZE_MAX
+#endif
+
+#ifndef MAX
+# define MAX(a, b) ((a < b) ? (b) : (a))
+#endif
+
+#ifndef CANON_ELEMENT
+# define CANON_ELEMENT(c) c
+#endif
+#ifndef CMP_FUNC
+# define CMP_FUNC memcmp
+#endif
+
+/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
+   Return the index of the first byte in the right half, and set
+   *PERIOD to the global period of the right half.
+
+   The global period of a string is the smallest index (possibly its
+   length) at which all remaining bytes in the string are repetitions
+   of the prefix (the last repetition may be a subset of the prefix).
+
+   When NEEDLE is factored into two halves, a local period is the
+   length of the smallest word that shares a suffix with the left half
+   and shares a prefix with the right half.  All factorizations of a
+   non-empty NEEDLE have a local period of at least 1 and no greater
+   than NEEDLE_LEN.
+
+   A critical factorization has the property that the local period
+   equals the global period.  All strings have at least one critical
+   factorization with the left half smaller than the global period.
+
+   Given an ordered alphabet, a critical factorization can be computed
+   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
+   larger of two ordered maximal suffixes.  The ordered maximal
+   suffixes are determined by lexicographic comparison of
+   periodicity.  */
+static size_t
+critical_factorization (const unsigned char *needle, size_t needle_len,
+			size_t *period)
+{
+  /* Index of last byte of left half, or SIZE_MAX.  */
+  size_t max_suffix, max_suffix_rev;
+  size_t j; /* Index into NEEDLE for current candidate suffix.  */
+  size_t k; /* Offset into current period.  */
+  size_t p; /* Intermediate period.  */
+  unsigned char a, b; /* Current comparison bytes.  */
+
+  /* Invariants:
+     0 <= j < NEEDLE_LEN - 1
+     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
+     min(max_suffix, max_suffix_rev) < global period of NEEDLE
+     1 <= p <= global period of NEEDLE
+     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
+     1 <= k <= p
+  */
+
+  /* Perform lexicographic search.  */
+  max_suffix = SIZE_MAX;
+  j = 0;
+  k = p = 1;
+  while (j + k < needle_len)
+    {
+      a = CANON_ELEMENT (needle[j + k]);
+      b = CANON_ELEMENT (needle[max_suffix + k]);
+      if (a < b)
+	{
+	  /* Suffix is smaller, period is entire prefix so far.  */
+	  j += k;
+	  k = 1;
+	  p = j - max_suffix;
+	}
+      else if (a == b)
+	{
+	  /* Advance through repetition of the current period.  */
+	  if (k != p)
+	    ++k;
+	  else
+	    {
+	      j += p;
+	      k = 1;
+	    }
+	}
+      else /* b < a */
+	{
+	  /* Suffix is larger, start over from current location.  */
+	  max_suffix = j++;
+	  k = p = 1;
+	}
+    }
+  *period = p;
+
+  /* Perform reverse lexicographic search.  */
+  max_suffix_rev = SIZE_MAX;
+  j = 0;
+  k = p = 1;
+  while (j + k < needle_len)
+    {
+      a = CANON_ELEMENT (needle[j + k]);
+      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
+      if (b < a)
+	{
+	  /* Suffix is smaller, period is entire prefix so far.  */
+	  j += k;
+	  k = 1;
+	  p = j - max_suffix_rev;
+	}
+      else if (a == b)
+	{
+	  /* Advance through repetition of the current period.  */
+	  if (k != p)
+	    ++k;
+	  else
+	    {
+	      j += p;
+	      k = 1;
+	    }
+	}
+      else /* a < b */
+	{
+	  /* Suffix is larger, start over from current location.  */
+	  max_suffix_rev = j++;
+	  k = p = 1;
+	}
+    }
+
+  /* Choose the longer suffix.  Return the first byte of the right
+     half, rather than the last byte of the left half.  */
+  if (max_suffix_rev + 1 < max_suffix + 1)
+    return max_suffix + 1;
+  *period = p;
+  return max_suffix_rev + 1;
+}
+
+/* Return the first location of non-empty NEEDLE within HAYSTACK, or
+   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
+   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
+   Performance is guaranteed to be linear, with an initialization cost
+   of 2 * NEEDLE_LEN comparisons.
+
+   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
+   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
+   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
+   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
+static RETURN_TYPE
+two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
+		      const unsigned char *needle, size_t needle_len)
+{
+  size_t i; /* Index into current byte of NEEDLE.  */
+  size_t j; /* Index into current window of HAYSTACK.  */
+  size_t period; /* The period of the right half of needle.  */
+  size_t suffix; /* The index of the right half of needle.  */
+
+  /* Factor the needle into two halves, such that the left half is
+     smaller than the global period, and the right half is
+     periodic (with a period as large as NEEDLE_LEN - suffix).  */
+  suffix = critical_factorization (needle, needle_len, &period);
+
+  /* Perform the search.  Each iteration compares the right half
+     first.  */
+  if (CMP_FUNC (needle, needle + period, suffix) == 0)
+    {
+      /* Entire needle is periodic; a mismatch can only advance by the
+	 period, so use memory to avoid rescanning known occurrences
+	 of the period.  */
+      size_t memory = 0;
+      j = 0;
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
+	{
+	  /* Scan for matches in right half.  */
+	  i = MAX (suffix, memory);
+	  while (i < needle_len && (CANON_ELEMENT (needle[i])
+				    == CANON_ELEMENT (haystack[i + j])))
+	    ++i;
+	  if (needle_len <= i)
+	    {
+	      /* Scan for matches in left half.  */
+	      i = suffix - 1;
+	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
+					== CANON_ELEMENT (haystack[i + j])))
+		--i;
+	      if (i + 1 < memory + 1)
+		return (RETURN_TYPE) (haystack + j);
+	      /* No match, so remember how many repetitions of period
+		 on the right half were scanned.  */
+	      j += period;
+	      memory = needle_len - period;
+	    }
+	  else
+	    {
+	      j += i - suffix + 1;
+	      memory = 0;
+	    }
+	}
+    }
+  else
+    {
+      /* The two halves of needle are distinct; no extra memory is
+	 required, and any mismatch results in a maximal shift.  */
+      period = MAX (suffix, needle_len - suffix) + 1;
+      j = 0;
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
+	{
+	  /* Scan for matches in right half.  */
+	  i = suffix;
+	  while (i < needle_len && (CANON_ELEMENT (needle[i])
+				    == CANON_ELEMENT (haystack[i + j])))
+	    ++i;
+	  if (needle_len <= i)
+	    {
+	      /* Scan for matches in left half.  */
+	      i = suffix - 1;
+	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
+				       == CANON_ELEMENT (haystack[i + j])))
+		--i;
+	      if (i == SIZE_MAX)
+		return (RETURN_TYPE) (haystack + j);
+	      j += period;
+	    }
+	  else
+	    j += i - suffix + 1;
+	}
+    }
+  return NULL;
+}
+
+/* Return the first location of non-empty NEEDLE within HAYSTACK, or
+   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
+   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
+   Performance is guaranteed to be linear, with an initialization cost
+   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
+
+   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
+   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
+   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
+   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
+   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
+   sublinear performance is not possible.  */
+static RETURN_TYPE
+two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
+		     const unsigned char *needle, size_t needle_len)
+{
+  size_t i; /* Index into current byte of NEEDLE.  */
+  size_t j; /* Index into current window of HAYSTACK.  */
+  size_t period; /* The period of the right half of needle.  */
+  size_t suffix; /* The index of the right half of needle.  */
+  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
+
+  /* Factor the needle into two halves, such that the left half is
+     smaller than the global period, and the right half is
+     periodic (with a period as large as NEEDLE_LEN - suffix).  */
+  suffix = critical_factorization (needle, needle_len, &period);
+
+  /* Populate shift_table.  For each possible byte value c,
+     shift_table[c] is the distance from the last occurrence of c to
+     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
+     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
+  for (i = 0; i < 1U << CHAR_BIT; i++)
+    shift_table[i] = needle_len;
+  for (i = 0; i < needle_len; i++)
+    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
+
+  /* Perform the search.  Each iteration compares the right half
+     first.  */
+  if (CMP_FUNC (needle, needle + period, suffix) == 0)
+    {
+      /* Entire needle is periodic; a mismatch can only advance by the
+	 period, so use memory to avoid rescanning known occurrences
+	 of the period.  */
+      size_t memory = 0;
+      size_t shift;
+      j = 0;
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
+	{
+	  /* Check the last byte first; if it does not match, then
+	     shift to the next possible match location.  */
+	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
+	  if (0 < shift)
+	    {
+	      if (memory && shift < period)
+		{
+		  /* Since needle is periodic, but the last period has
+		     a byte out of place, there can be no match until
+		     after the mismatch.  */
+		  shift = needle_len - period;
+		  memory = 0;
+		}
+	      j += shift;
+	      continue;
+	    }
+	  /* Scan for matches in right half.  The last byte has
+	     already been matched, by virtue of the shift table.  */
+	  i = MAX (suffix, memory);
+	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
+					== CANON_ELEMENT (haystack[i + j])))
+	    ++i;
+	  if (needle_len - 1 <= i)
+	    {
+	      /* Scan for matches in left half.  */
+	      i = suffix - 1;
+	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
+					== CANON_ELEMENT (haystack[i + j])))
+		--i;
+	      if (i + 1 < memory + 1)
+		return (RETURN_TYPE) (haystack + j);
+	      /* No match, so remember how many repetitions of period
+		 on the right half were scanned.  */
+	      j += period;
+	      memory = needle_len - period;
+	    }
+	  else
+	    {
+	      j += i - suffix + 1;
+	      memory = 0;
+	    }
+	}
+    }
+  else
+    {
+      /* The two halves of needle are distinct; no extra memory is
+	 required, and any mismatch results in a maximal shift.  */
+      size_t shift;
+      period = MAX (suffix, needle_len - suffix) + 1;
+      j = 0;
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
+	{
+	  /* Check the last byte first; if it does not match, then
+	     shift to the next possible match location.  */
+	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
+	  if (0 < shift)
+	    {
+	      j += shift;
+	      continue;
+	    }
+	  /* Scan for matches in right half.  The last byte has
+	     already been matched, by virtue of the shift table.  */
+	  i = suffix;
+	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
+					== CANON_ELEMENT (haystack[i + j])))
+	    ++i;
+	  if (needle_len - 1 <= i)
+	    {
+	      /* Scan for matches in left half.  */
+	      i = suffix - 1;
+	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
+				       == CANON_ELEMENT (haystack[i + j])))
+		--i;
+	      if (i == SIZE_MAX)
+		return (RETURN_TYPE) (haystack + j);
+	      j += period;
+	    }
+	  else
+	    j += i - suffix + 1;
+	}
+    }
+  return NULL;
+}
+
+#undef AVAILABLE
+#undef CANON_ELEMENT
+#undef CMP_FUNC
+#undef MAX
+#undef RETURN_TYPE
-- 
1.6.2.rc2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] import memmem() with linear complexity from Gnulib
  2009-02-28 19:16             ` [PATCH] import memmem() with linear complexity from Gnulib René Scharfe
@ 2009-02-28 22:44               ` Mike Hommey
  2009-03-01  3:41                 ` Jeff King
  0 siblings, 1 reply; 23+ messages in thread
From: Mike Hommey @ 2009-02-28 22:44 UTC (permalink / raw)
  To: René Scharfe; +Cc: Junio C Hamano, git

On Sat, Feb 28, 2009 at 08:16:55PM +0100, René Scharfe wrote:
> Gnulib and glibc have gained a memmem() implementation using the Two-Way
> algorithm, which needs constant space and linear time.  Import it to
> compat/ in order to replace the simple quadratic implementation there.
> 
> memmem.c and str-two-way.h are copied verbatim from the repository at
> git://git.savannah.gnu.org/gnulib.git, with the following changes to
> memmem.c to make it fit into git's build environment:
> 
> 	21,23c21
> 	< #ifndef _LIBC
> 	< # include <config.h>
> 	< #endif
> 	---
> 	> #include "../git-compat-util.h"
> 	40c38
> 	< memmem (const void *haystack_start, size_t haystack_len,
> 	---
> 	> gitmemmem(const void *haystack_start, size_t haystack_len,
> 
> Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
> ---
>  Makefile             |    1 +
>  compat/memmem.c      |  103 +++++++++----
>  compat/str-two-way.h |  429 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 504 insertions(+), 29 deletions(-)

Seeing how much memmem is being used in the codebase, is it really worth?

Mike

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] import memmem() with linear complexity from Gnulib
  2009-02-28 22:44               ` Mike Hommey
@ 2009-03-01  3:41                 ` Jeff King
  2009-03-01 11:15                   ` René Scharfe
  0 siblings, 1 reply; 23+ messages in thread
From: Jeff King @ 2009-03-01  3:41 UTC (permalink / raw)
  To: Mike Hommey; +Cc: René Scharfe, Junio C Hamano, git

On Sat, Feb 28, 2009 at 11:44:01PM +0100, Mike Hommey wrote:

> > ---
> >  Makefile             |    1 +
> >  compat/memmem.c      |  103 +++++++++----
> >  compat/str-two-way.h |  429 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 504 insertions(+), 29 deletions(-)
> 
> Seeing how much memmem is being used in the codebase, is it really worth?

See earlier in the thread, where "git log -Stoken" is substantially
faster on Linux versus Windows (but some exact numbers before and after
on Windows would be nice to have in the commit message).

-Peff

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] import memmem() with linear complexity from Gnulib
  2009-03-01  3:41                 ` Jeff King
@ 2009-03-01 11:15                   ` René Scharfe
  2009-03-01 18:55                     ` René Scharfe
  0 siblings, 1 reply; 23+ messages in thread
From: René Scharfe @ 2009-03-01 11:15 UTC (permalink / raw)
  To: Jeff King; +Cc: Mike Hommey, Junio C Hamano, git

Jeff King schrieb:
> On Sat, Feb 28, 2009 at 11:44:01PM +0100, Mike Hommey wrote:
> 
>>> ---
>>>  Makefile             |    1 +
>>>  compat/memmem.c      |  103 +++++++++----
>>>  compat/str-two-way.h |  429 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 504 insertions(+), 29 deletions(-)
>> Seeing how much memmem is being used in the codebase, is it really worth?
> 
> See earlier in the thread, where "git log -Stoken" is substantially
> faster on Linux versus Windows (but some exact numbers before and after
> on Windows would be nice to have in the commit message).

Yes, and also please see the other numbers I just posted.  It's more of
an update -- we took the current memmem() fall-back from glibc, too, and
they switched to this shiny new implementation to avoid the quadratic
complexity of the old one in the meantime.

I was going to say that with a fast memmem() we could convert some
strstr() calls, especially those where we know the lengths of the
strings anyway -- intuitively, memmem() should be faster than strstr()
in that case.  However, the following patch on top of the Gnulib import
makes "git grep grep v1.6.1" take 10% *more* time for me (on Windows).
I wonder why.


diff --git a/grep.c b/grep.c
index 062b2b6..66ef171 100644
--- a/grep.c
+++ b/grep.c
@@ -39,6 +39,8 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
 {
 	int err;
 
+	p->pattern_len = strlen(p->pattern);
+
 	if (opt->fixed || is_fixed(p->pattern))
 		p->fixed = 1;
 	if (opt->regflags & REG_ICASE)
@@ -268,16 +270,17 @@ static void show_name(struct grep_opt *opt, const char *name)
 	printf("%s%c", name, opt->null_following_name ? '\0' : '\n');
 }
 
-static int fixmatch(const char *pattern, char *line, regmatch_t *match)
+static int fixmatch(const char *pattern, size_t pattern_len, const char *bol,
+		    const char *eol, regmatch_t *match)
 {
-	char *hit = strstr(line, pattern);
+	char *hit = memmem(bol, eol - bol, pattern, pattern_len);
 	if (!hit) {
 		match->rm_so = match->rm_eo = -1;
 		return REG_NOMATCH;
 	}
 	else {
-		match->rm_so = hit - line;
-		match->rm_eo = match->rm_so + strlen(pattern);
+		match->rm_so = hit - bol;
+		match->rm_eo = match->rm_so + pattern_len;
 		return 0;
 	}
 }
@@ -335,7 +338,7 @@ static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol
 			       pmatch, 0);
 	}
 	else {
-		hit = !fixmatch(p->pattern, bol, pmatch);
+		hit = !fixmatch(p->pattern, p->pattern_len, bol, eol, pmatch);
 	}
 
 	if (hit && opt->word_regexp) {
diff --git a/grep.h b/grep.h
index 5102ce3..2e22ab2 100644
--- a/grep.h
+++ b/grep.h
@@ -28,6 +28,7 @@ struct grep_pat {
 	int no;
 	enum grep_pat_token token;
 	const char *pattern;
+	size_t pattern_len;
 	enum grep_header_field field;
 	regex_t regexp;
 	unsigned fixed:1;

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] import memmem() with linear complexity from Gnulib
  2009-03-01 11:15                   ` René Scharfe
@ 2009-03-01 18:55                     ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2009-03-01 18:55 UTC (permalink / raw)
  To: Jeff King; +Cc: Mike Hommey, Junio C Hamano, git

René Scharfe schrieb:
> I was going to say that with a fast memmem() we could convert some
> strstr() calls, especially those where we know the lengths of the
> strings anyway -- intuitively, memmem() should be faster than strstr()
> in that case.  However, the following patch on top of the Gnulib import
> makes "git grep grep v1.6.1" take 10% *more* time for me (on Windows).
> I wonder why.

More numbers.  "memmem" is the patch I'm replying to which converts
grep's fixed string search from strstr() to memmem().  "quadratic" means
the current version of compat/memmem.c was used (NO_MEMMEM=yes),
"linear" is the same, but with the newer memmem() from Gnulib imported.
 The numbers are elapsed seconds for the following command, run in a
Linux kernel repo:

   git grep grep v2.6.28 >/dev/null

                                   Ubuntu Fedora
   [1] v1.6.2-rc2                    2.99   3.52
   [2] v1.6.2-rc2+memmem             3.09   3.28
   [3] v1.6.2-rc2+memmem+quadratic   8.42   8.50
   [4] v1.6.2-rc2+memmem+linear      3.17   3.25

So, we should be careful when using memmem() as long as we have the
current implementation in compat/, as the quadratic complexity can
result in a significant slowdown ([3]).

The new memmem() indeed is faster than the new strstr(), as expected
(Fedora [2] and [4] vs. Fedora [1]).  Remember, Fedora 10 already
includes the glibc version with the linear implementations while Ubuntu
8.10 doesn't.

I'm not sure if the difference between the old and new memmem() is
significant or in the noise (Ubuntu [2] and [4]).

In any case, and this surprised me, the fastest of them all is the old
strstr() (Ubuntu [1]).  This is consistent with the slowdown observed on
Windows.

What's even more surprising is the difference between Ubuntu [2] and
[3], which use basically the same memmem().  Or so I thought.  Wrong.
The old memmem() in glibc has a small but effective optimization, that
our version lacks.  I don't remember if I cut it out during the initial
import for increased simplicity or if the version the import was based
on didn't have it.  Anyway, with the following patch, case [3] runs as
fast as case [2] on both Fedora and Ubuntu.

Next step would be to check if pickaxe simply needs this short patch or
really the full-blown linear reimplementation.  I'm off to an
appointment, though, so I'll look into this again tomorrow.

René

diff --git a/compat/memmem.c b/compat/memmem.c
index cd0d877..fed5a77 100644
--- a/compat/memmem.c
+++ b/compat/memmem.c
@@ -5,6 +5,7 @@ void *gitmemmem(const void *haystack, size_t haystack_len,
 {
 	const char *begin = haystack;
 	const char *last_possible = begin + haystack_len - needle_len;
+	char tip;

 	/*
 	 * The first occurrence of the empty string is deemed to occur at
@@ -20,8 +21,10 @@ void *gitmemmem(const void *haystack, size_t haystack_len,
 	if (haystack_len < needle_len)
 		return NULL;

+	tip = *((const char *)needle);
+
 	for (; begin <= last_possible; begin++) {
-		if (!memcmp(begin, needle, needle_len))
+		if (*begin == tip && !memcmp(begin, needle, needle_len))
 			return (void *)begin;
 	}

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-02-28 13:10         ` René Scharfe
  2009-02-28 17:40           ` Junio C Hamano
@ 2009-03-01  7:31           ` Junio C Hamano
  2009-03-01 10:53             ` René Scharfe
  1 sibling, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2009-03-01  7:31 UTC (permalink / raw)
  To: René Scharfe; +Cc: git

René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:

> I get this (Ubuntu 8.10 x64, Fedora 10 x64 using the same Linux repo,
> Windows Vista x64 using a different Linux repo with the same HEAD on
> NTFS and msysgit, numbers are the elapsed time in seconds, best of five
> runs):
>
>                            Ubuntu  Fedora  Windows
>    v1.6.2-rc2                8.14    8.16    9.236
>    v1.6.2-rc2+[1-4]          2.43    2.45    2.995
>    v1.6.2-rc2+[1-4]+memmem   1.31    1.25    2.917
>    v1.6.2-rc2+[1-3]+memmem   1.51    1.16    8.455
>
> Ubuntu has glibc 2.8, while Fedora 10 has glibc 2.9, with a new and more
> efficient memmem() implementation.  On Windows, we use our own naive
> memmem() implementation.

Shoot, I was not being careful enough.

> So using memmem() is worthwhile.  And providing a better fall-back
> version in compat/ can speed up this particular case to the point where
> the fourth patch becomes moot.

Are these numbers telling me that with a good memmem() implementation,
patch 4/4 is not just moot but actively detrimental?

With a long enough needle, it entirely is possible that scanning the whole
image with sublinear string search algorithm would perform much better
than the preprocessing patch 4/4 does which has to scan all the bytes in
the common parts.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/4] diffcore-pickaxe: further refactor count_match()
  2009-03-01  7:31           ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
@ 2009-03-01 10:53             ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2009-03-01 10:53 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano schrieb:
> René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> 
>> I get this (Ubuntu 8.10 x64, Fedora 10 x64 using the same Linux repo,
>> Windows Vista x64 using a different Linux repo with the same HEAD on
>> NTFS and msysgit, numbers are the elapsed time in seconds, best of five
>> runs):
>>
>>                            Ubuntu  Fedora  Windows
>>    v1.6.2-rc2                8.14    8.16    9.236
>>    v1.6.2-rc2+[1-4]          2.43    2.45    2.995
>>    v1.6.2-rc2+[1-4]+memmem   1.31    1.25    2.917
>>    v1.6.2-rc2+[1-3]+memmem   1.51    1.16    8.455
>>
>> Ubuntu has glibc 2.8, while Fedora 10 has glibc 2.9, with a new and more
>> efficient memmem() implementation.  On Windows, we use our own naive
>> memmem() implementation.

Correction: On Windows, we use the previous, quadratic, naive memmem()
implementation from glibc, not anything we came up with.

>> So using memmem() is worthwhile.  And providing a better fall-back
>> version in compat/ can speed up this particular case to the point where
>> the fourth patch becomes moot.
> 
> Are these numbers telling me that with a good memmem() implementation,
> patch 4/4 is not just moot but actively detrimental?
> 
> With a long enough needle, it entirely is possible that scanning the whole
> image with sublinear string search algorithm would perform much better
> than the preprocessing patch 4/4 does which has to scan all the bytes in
> the common parts.

Yes, patch 4 makes it go slower than using memmem() alone, in this test.

Here are a few more numbers, all measured on Windows.  The topmost four
times in the column "long" should be the same as in the Windows column
above, yet they are slightly bigger.  Some background process must've
decided to do some work.

  git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null

  short: STRING='e'
  long:  STRING='Ensure that the real time constraints are schedulable.'

                                  short  long
  v1.6.2-rc2                      9.120  9.266
  v1.6.2-rc2+[1-4]                3.037  3.048
  v1.6.2-rc2+[1-4]+memmem         2.994  2.964
  v1.6.2-rc2+[1-3]+memmem         8.514  8.528
  v1.6.2-rc2+[1-4]+memmem+linear  1.939  1.759
  v1.6.2-rc2+[1-3]+memmem+linear  2.315  1.559

So patch 4 helps significantly with short needles, while it hurts a bit
with long needles.  Linear memmem() is faster than the quadratic one we
currently have in compat/ in all cases.

René

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts
  2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
                   ` (2 preceding siblings ...)
  2009-02-26  6:52 ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
@ 2009-02-26  6:52 ` Junio C Hamano
  2009-02-26  9:05   ` Junio C Hamano
  2009-03-02 23:00 ` [PATCH 1/2] diffcore-pickaxe: use memmem() René Scharfe
  4 siblings, 1 reply; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  6:52 UTC (permalink / raw)
  To: git

We can optimize the pickaxe search by removing the lines at the beginning
and at the end that are common between the preimage and the postimage, and
counting the occurrences of the "needle" in the haystack that is now made
smaller.

One thing that we need to be careful about is that we should not remove
the common part in full.  We need to keep at least the same length as the
"needle" from the common part, so that we can detect the differences in a
case like this:

  preimage: common part then needle appears in this file
 postimage: common part then needlX appears in this file

If we literally remove the common leading part up to "needl" and start
counting from the byte after that, we will miss that preimage has an
occurrence of "needle" but postimage does not.

With this optimization in place, the following query in the Linux kernel
repository on my machine becomes about 40% faster:

$ STRING='Ensure that the real time constraints are schedulable.'
$ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null

(Before the patch, best of 5 runs)
5.59user 0.15system 0:05.74elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+39956minor)pagefaults 0swaps

(After the patch, best of 5 runs)
3.04user 0.17system 0:03.23elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+49697minor)pagefaults 0swaps

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-pickaxe.c |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index 4a0dca1..b447d75 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -68,9 +68,59 @@ static int has_different_matches(struct diff_filepair *p,
 				 const char *needle, unsigned long len,
 				 regex_t *regexp)
 {
-	return (count_match(0, p->one, needle, len, regexp)
-		!= count_match(0, p->two, needle, len, regexp));
+	const char *one, *two;
+	unsigned long one_size, two_size, common, trim;
+	int differs;
 
+	if (regexp)
+		return (count_match(0, p->one, needle, len, regexp)
+			!= count_match(0, p->two, needle, len, regexp));
+	if (diff_populate_filespec(p->one, 0) ||
+	    diff_populate_filespec(p->two, 0))
+		return 0;
+
+	one = p->one->data;
+	two = p->two->data;
+	one_size = p->one->size;
+	two_size = p->two->size;
+	common = (one_size < two_size) ? one_size : two_size;
+
+	/* Skip the initial common part */
+	for (trim = 0;
+	     trim < common && one[trim] == two[trim];
+	     trim++)
+		;
+
+	/* Start comparing at (len-1) bytes before the first difference */
+	if (len <= trim) {
+		trim -= len;
+		one += trim;
+		two += trim;
+		common -= trim;
+		one_size -= trim;
+		two_size -= trim;
+	}
+
+	/* Trim the common part at the end */
+	for (trim = 0;
+	     trim < common && one[one_size - trim - 1] == two[two_size - trim - 1];
+	     trim++)
+		;
+
+	/* Leave (len-1) bytes */
+	if (len <= trim) {
+		trim -= len;
+		one_size -= trim;
+		two_size -= trim;
+	}
+
+	differs = (count_match_internal(0, one, one_size, needle, len, regexp) !=
+		   count_match_internal(0, two, two_size, needle, len, regexp));
+
+	diff_free_filespec_data(p->one);
+	diff_free_filespec_data(p->two);
+
+	return differs;
 }
 
 static int pickaxe_match_one(struct diff_filepair *p,
-- 
1.6.2.rc2.91.gf9a36

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts
  2009-02-26  6:52 ` [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts Junio C Hamano
@ 2009-02-26  9:05   ` Junio C Hamano
  0 siblings, 0 replies; 23+ messages in thread
From: Junio C Hamano @ 2009-02-26  9:05 UTC (permalink / raw)
  To: git

Junio C Hamano <gitster@pobox.com> writes:

> With this optimization in place, the following query in the Linux kernel
> repository on my machine becomes about 40% faster:
>
> $ STRING='Ensure that the real time constraints are schedulable.'
> $ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null
>
> (Before the patch, best of 5 runs)
> 5.59user 0.15system 0:05.74elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
> 0inputs+0outputs (0major+39956minor)pagefaults 0swaps
>
> (After the patch, best of 5 runs)
> 3.04user 0.17system 0:03.23elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
> 0inputs+0outputs (0major+49697minor)pagefaults 0swaps
    
The file "kernel/sched.c" has roughly 900 changes applied to it, and over
its lifetime, it has grown from 5kB to 9kB in size.

I suspect a larger file might see a bigger performance boost.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/2] diffcore-pickaxe: use memmem()
  2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
                   ` (3 preceding siblings ...)
  2009-02-26  6:52 ` [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts Junio C Hamano
@ 2009-03-02 23:00 ` René Scharfe
  2009-03-02 23:19   ` [PATCH 2/2] optimize compat/ memmem() René Scharfe
  4 siblings, 1 reply; 23+ messages in thread
From: René Scharfe @ 2009-03-02 23:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Use memmem() instead of open-coding it.  The system libraries usually have a
much faster version than the memcmp()-loop here.  Even our own fall-back in
compat/, which is used on Windows, is slightly faster.

The following commands were run in a Linux kernel repository and timed, the
best of five results is shown:

  $ STRING='Ensure that the real time constraints are schedulable.'
  $ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null

On Ubuntu 8.10 x64, before (v1.6.2-rc2):

  8.09user 0.04system 0:08.14elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
  0inputs+0outputs (0major+30952minor)pagefaults 0swaps

And with the patch:

  1.50user 0.04system 0:01.54elapsed 100%CPU (0avgtext+0avgdata 0maxresident)k
  0inputs+0outputs (0major+30645minor)pagefaults 0swaps

On Fedora 10 x64, before:

  8.34user 0.05system 0:08.39elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
  0inputs+0outputs (0major+29268minor)pagefaults 0swaps

And with the patch:

  1.15user 0.05system 0:01.20elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
  0inputs+0outputs (0major+32253minor)pagefaults 0swaps

On Windows Vista x64, before:

  real    0m9.204s
  user    0m0.000s
  sys     0m0.000s

And with the patch:

  real    0m8.470s
  user    0m0.000s
  sys     0m0.000s

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 diffcore-pickaxe.c |   18 ++++++++----------
 1 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index af9fffe..574b3e8 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -10,7 +10,7 @@ static unsigned int contains(struct diff_filespec *one,
 			     regex_t *regexp)
 {
 	unsigned int cnt;
-	unsigned long offset, sz;
+	unsigned long sz;
 	const char *data;
 	if (diff_populate_filespec(one, 0))
 		return 0;
@@ -33,15 +33,13 @@ static unsigned int contains(struct diff_filespec *one,
 		}
 
 	} else { /* Classic exact string match */
-		/* Yes, I've heard of strstr(), but the thing is *data may
-		 * not be NUL terminated.  Sue me.
-		 */
-		for (offset = 0; offset + len <= sz; offset++) {
-			/* we count non-overlapping occurrences of needle */
-			if (!memcmp(needle, data + offset, len)) {
-				offset += len - 1;
-				cnt++;
-			}
+		while (sz) {
+			const char *found = memmem(data, sz, needle, len);
+			if (!found)
+				break;
+			sz -= found - data + len;
+			data = found + len;
+			cnt++;
 		}
 	}
 	diff_free_filespec_data(one);
-- 
1.6.2.rc2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 2/2] optimize compat/ memmem()
  2009-03-02 23:00 ` [PATCH 1/2] diffcore-pickaxe: use memmem() René Scharfe
@ 2009-03-02 23:19   ` René Scharfe
  0 siblings, 0 replies; 23+ messages in thread
From: René Scharfe @ 2009-03-02 23:19 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

When memmem() was imported from glibc 2.2 into compat/, an optimization
was dropped in the process, in order to make the code smaller and simpler.
It was OK because memmem() wasn't used in performance-critical code.  Now
the situation has changed and we can benefit from this optimization.

The trick is to avoid calling memcmp() if the first character of the needle
already doesn't match.  Checking one character directly is much cheaper
than the function call overhead.  We keep the first character of the needle
in the variable named point and the rest in the one named tail.

The following commands were run in a Linux kernel repository and timed, the
best of five results is shown:

  $ STRING='Ensure that the real time constraints are schedulable.'
  $ git log -S"$STRING" HEAD -- kernel/sched.c >/dev/null

On Windows Vista x64, before:

  real    0m8.470s
  user    0m0.000s
  sys     0m0.000s

And after the patch:

  real    0m1.887s
  user    0m0.000s
  sys     0m0.000s

Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
This performance fix is a good idea, even if we later decide to import
the latest version of memmem() from Gnulib, I think.  And I'm not so
sure any more that we want to do that in the first place.  More
measurements are needed (which goes for these two patches, too, of
course), but this version should provide a better baseline.

 compat/memmem.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/compat/memmem.c b/compat/memmem.c
index cd0d877..56bcb42 100644
--- a/compat/memmem.c
+++ b/compat/memmem.c
@@ -5,6 +5,8 @@ void *gitmemmem(const void *haystack, size_t haystack_len,
 {
 	const char *begin = haystack;
 	const char *last_possible = begin + haystack_len - needle_len;
+	const char *tail = needle;
+	char point;
 
 	/*
 	 * The first occurrence of the empty string is deemed to occur at
@@ -20,8 +22,9 @@ void *gitmemmem(const void *haystack, size_t haystack_len,
 	if (haystack_len < needle_len)
 		return NULL;
 
+	point = *tail++;
 	for (; begin <= last_possible; begin++) {
-		if (!memcmp(begin, needle, needle_len))
+		if (*begin == point && !memcmp(begin + 1, tail, needle_len - 1))
 			return (void *)begin;
 	}
 
-- 
1.6.2.rc2

^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-03-02 23:21 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-02-26  6:52 [PATCH 0/4] Pickaxe search clean-up and optimization Junio C Hamano
2009-02-26  6:52 ` [PATCH 1/4] diffcore-pickaxe: refactor diffcore_pickaxe() Junio C Hamano
2009-02-27 23:58   ` René Scharfe
2009-02-26  6:52 ` [PATCH 2/4] diffcore-pickaxe: micro-optimize has_match() function Junio C Hamano
2009-02-26  6:52 ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
2009-02-26  7:23   ` Kjetil Barvik
2009-02-28  1:13   ` René Scharfe
2009-02-28  1:25     ` Junio C Hamano
2009-02-28  6:08       ` Junio C Hamano
2009-02-28 13:10         ` René Scharfe
2009-02-28 17:40           ` Junio C Hamano
2009-02-28 18:15             ` René Scharfe
2009-02-28 19:16             ` [PATCH] import memmem() with linear complexity from Gnulib René Scharfe
2009-02-28 22:44               ` Mike Hommey
2009-03-01  3:41                 ` Jeff King
2009-03-01 11:15                   ` René Scharfe
2009-03-01 18:55                     ` René Scharfe
2009-03-01  7:31           ` [PATCH 3/4] diffcore-pickaxe: further refactor count_match() Junio C Hamano
2009-03-01 10:53             ` René Scharfe
2009-02-26  6:52 ` [PATCH 4/4] diffcore-pickaxe: optimize by trimming common initial and trailing parts Junio C Hamano
2009-02-26  9:05   ` Junio C Hamano
2009-03-02 23:00 ` [PATCH 1/2] diffcore-pickaxe: use memmem() René Scharfe
2009-03-02 23:19   ` [PATCH 2/2] optimize compat/ memmem() René Scharfe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).