git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 1/2] refs.c: optimize check_refname_component()
@ 2014-06-01  5:17 David Turner
  2014-06-01  5:17 ` [PATCH v4 2/2] refs.c: SSE4.2 optimizations for check_refname_component David Turner
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: David Turner @ 2014-06-01  5:17 UTC (permalink / raw)
  To: Michael Haggerty, Junio C Hamano, git; +Cc: David Turner

In a repository with many refs, check_refname_component can be a major
contributor to the runtime of some git commands. One such command is

git rev-parse HEAD

Timings for one particular repo, with about 60k refs, almost all
packed, are:

Old: 35 ms
New: 29 ms

Many other commands which read refs are also sped up.

Signed-off-by: David Turner <dturner@twitter.com>
---
 refs.c | 68 ++++++++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/refs.c b/refs.c
index 28d5eca..62e2301 100644
--- a/refs.c
+++ b/refs.c
@@ -5,9 +5,32 @@
 #include "dir.h"
 #include "string-list.h"
 
+/* How to handle various characters in refnames:
+ * 0: An acceptable character for refs
+ * 1: End-of-component
+ * 2: ., look for a following . to reject .. in refs
+ * 3: @, look for a following { to reject @{ in refs
+ * 9: A bad character, reject ref
+ *
+ * See below for the list of illegal characters, from which
+ * this table is derived.
+ */
+static unsigned char refname_disposition[] = {
+	1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
+};
+
 /*
- * Make sure "ref" is something reasonable to have under ".git/refs/";
- * We do not like it if:
+ * Try to read one refname component from the front of refname.
+ * Return the length of the component found, or -1 if the component is
+ * not legal.  It is legal if it is something reasonable to have under
+ * ".git/refs/"; We do not like it if:
  *
  * - any path component of it begins with ".", or
  * - it has double dots "..", or
@@ -15,24 +38,7 @@
  * - it ends with a "/".
  * - it ends with ".lock"
  * - it contains a "\" (backslash)
- */
 
-/* Return true iff ch is not allowed in reference names. */
-static inline int bad_ref_char(int ch)
-{
-	if (((unsigned) ch) <= ' ' || ch == 0x7f ||
-	    ch == '~' || ch == '^' || ch == ':' || ch == '\\')
-		return 1;
-	/* 2.13 Pattern Matching Notation */
-	if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
-		return 1;
-	return 0;
-}
-
-/*
- * Try to read one refname component from the front of refname.  Return
- * the length of the component found, or -1 if the component is not
- * legal.
  */
 static int check_refname_component(const char *refname, int flags)
 {
@@ -40,17 +46,25 @@ static int check_refname_component(const char *refname, int flags)
 	char last = '\0';
 
 	for (cp = refname; ; cp++) {
-		char ch = *cp;
-		if (ch == '\0' || ch == '/')
+		unsigned char ch = (unsigned char) *cp;
+		char disp = refname_disposition[ch];
+		switch(disp) {
+		case 1:
+			goto out;
+		case 2:
+			if (last == '.')
+				return -1; /* Refname contains "..". */
 			break;
-		if (bad_ref_char(ch))
-			return -1; /* Illegal character in refname. */
-		if (last == '.' && ch == '.')
-			return -1; /* Refname contains "..". */
-		if (last == '@' && ch == '{')
-			return -1; /* Refname contains "@{". */
+		case 3:
+			if (last == '@')
+				return -1; /* Refname contains "@{". */
+			break;
+		case 9:
+			return -1;
+		}
 		last = ch;
 	}
+out:
 	if (cp == refname)
 		return 0; /* Component has zero length. */
 	if (refname[0] == '.') {
-- 
2.0.0.rc1.18.gf763c0f

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v4 2/2] refs.c: SSE4.2 optimizations for check_refname_component
  2014-06-01  5:17 [PATCH v4 1/2] refs.c: optimize check_refname_component() David Turner
@ 2014-06-01  5:17 ` David Turner
  2014-06-01  7:17 ` [PATCH v4 1/2] refs.c: optimize check_refname_component() Andreas Schwab
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: David Turner @ 2014-06-01  5:17 UTC (permalink / raw)
  To: Michael Haggerty, Junio C Hamano, git; +Cc: David Turner

Optimize check_refname_component using SSE4.2, where available.

git rev-parse HEAD is a good test-case for this, since it does almost
nothing except parse refs.  For one particular repo with about 60k
refs, almost all packed, the timings are:

Look up table: 29 ms
SSE4.2:        25 ms

This is about a 15% improvement.

The configure.ac changes include code from the GNU C Library written
by Joseph S. Myers <joseph at codesourcery dot com>.

Signed-off-by: David Turner <dturner@twitter.com>
---
 Makefile           |   6 +++
 aclocal.m4         |   6 +++
 configure.ac       |  17 ++++++++
 git-compat-util.h  |  20 +++++++++
 refs.c             | 116 +++++++++++++++++++++++++++++++++++++++++++++--------
 t/t5511-refspec.sh |  13 ++++++
 6 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index a53f3a8..dd2127a 100644
--- a/Makefile
+++ b/Makefile
@@ -1326,6 +1326,11 @@ else
 		COMPAT_OBJS += compat/win32mmap.o
 	endif
 endif
+ifdef NO_SSE42
+	BASIC_CFLAGS += -DNO_SSE42
+else
+	BASIC_CFLAGS += -msse4.2
+endif
 ifdef OBJECT_CREATION_USES_RENAMES
 	COMPAT_CFLAGS += -DOBJECT_CREATION_MODE=1
 endif
@@ -2199,6 +2204,7 @@ GIT-BUILD-OPTIONS: FORCE
 	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
 	@echo NO_PYTHON=\''$(subst ','\'',$(subst ','\'',$(NO_PYTHON)))'\' >>$@
 	@echo NO_UNIX_SOCKETS=\''$(subst ','\'',$(subst ','\'',$(NO_UNIX_SOCKETS)))'\' >>$@
+	@echo NO_SSE42=\''$(subst ','\'',$(subst ','\'',$(NO_SSE42)))'\' >>$@
 ifdef TEST_OUTPUT_DIRECTORY
 	@echo TEST_OUTPUT_DIRECTORY=\''$(subst ','\'',$(subst ','\'',$(TEST_OUTPUT_DIRECTORY)))'\' >>$@
 endif
diff --git a/aclocal.m4 b/aclocal.m4
index f11bc7e..d9f3f19 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -38,3 +38,9 @@ AC_DEFUN([TYPE_SOCKLEN_T],
       [#include <sys/types.h>
 #include <sys/socket.h>])
 ])
+
+dnl Test a compiler option or options with an empty input file.
+dnl LIBC_TRY_CC_OPTION([options], [action-if-true], [action-if-false])
+AC_DEFUN([LIBC_TRY_CC_OPTION],
+[AS_IF([AC_TRY_COMMAND([${CC-cc} $1 -xc /dev/null -S -o /dev/null])],
+	[$2], [$3])])
diff --git a/configure.ac b/configure.ac
index b711254..3a5bda9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -382,6 +382,23 @@ AS_HELP_STRING([],[Tcl/Tk interpreter will be found in a system.]),
 GIT_PARSE_WITH(tcltk))
 #
 
+# Declare the with-sse42/without-sse42 options.
+AC_ARG_WITH(sse42,
+AS_HELP_STRING([--with-sse42],[use SSE4.2 instructions])
+AS_HELP_STRING([],[(default is YES if your compiler supports -msse4.2)]),
+GIT_PARSE_WITH(sse42))
+
+if test "$NO_SSE42" != "YesPlease"; then
+   dnl Check if -msse4.2 works.
+   AC_CACHE_CHECK(for SSE4.2 support, cc_cv_sse42, [dnl
+   LIBC_TRY_CC_OPTION([-msse4.2], [cc_cv_sse42=yes], [cc_cv_sse42=no])
+   ])
+   if test $cc_cv_sse42 = no; then
+     NO_SSE42=1
+   fi
+fi
+
+GIT_CONF_SUBST([NO_SSE42])
 
 ## Checks for programs.
 AC_MSG_NOTICE([CHECKS for programs])
diff --git a/git-compat-util.h b/git-compat-util.h
index f6d3a46..254487a 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -668,6 +668,26 @@ void git_qsort(void *base, size_t nmemb, size_t size,
 #endif
 #endif
 
+#ifndef NO_SSE42
+#include <nmmintrin.h>
+/* Clang ships with a version of nmmintrin.h that's incomplete; if
+ * necessary, we define the constants that we're going to use. */
+#ifndef _SIDD_UBYTE_OPS
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#endif
+
+/* This is the system memory page size; it's used so that we can read
+ * outside the bounds of an allocation without segfaulting. It is
+ * assumed to be a power of 2. */
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+#endif
+
 #ifdef UNRELIABLE_FSTAT
 #define fstat_is_reliable() 0
 #else
diff --git a/refs.c b/refs.c
index 62e2301..ed436ea 100644
--- a/refs.c
+++ b/refs.c
@@ -26,6 +26,25 @@ static unsigned char refname_disposition[] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
 };
 
+static int check_refname_component_trailer(const char *cp, const char *refname, int flags)
+{
+	if (cp == refname)
+		return 0; /* Component has zero length. */
+	if (refname[0] == '.') {
+		if (!(flags & REFNAME_DOT_COMPONENT))
+			return -1; /* Component starts with '.'. */
+		/*
+		 * Even if leading dots are allowed, don't allow "."
+		 * as a component (".." is prevented by a rule above).
+		 */
+		if (refname[1] == '\0')
+			return -1; /* Component equals ".". */
+	}
+	if (cp - refname >= 5 && !memcmp(cp - 5, ".lock", 5))
+		return -1; /* Refname ends with ".lock". */
+	return cp - refname;
+}
+
 /*
  * Try to read one refname component from the front of refname.
  * Return the length of the component found, or -1 if the component is
@@ -40,7 +59,7 @@ static unsigned char refname_disposition[] = {
  * - it contains a "\" (backslash)
 
  */
-static int check_refname_component(const char *refname, int flags)
+static int check_refname_component_1(const char *refname, int flags)
 {
 	const char *cp;
 	char last = '\0';
@@ -50,7 +69,7 @@ static int check_refname_component(const char *refname, int flags)
 		char disp = refname_disposition[ch];
 		switch(disp) {
 		case 1:
-			goto out;
+			return check_refname_component_trailer(cp, refname, flags);
 		case 2:
 			if (last == '.')
 				return -1; /* Refname contains "..". */
@@ -64,23 +83,86 @@ static int check_refname_component(const char *refname, int flags)
 		}
 		last = ch;
 	}
-out:
-	if (cp == refname)
-		return 0; /* Component has zero length. */
-	if (refname[0] == '.') {
-		if (!(flags & REFNAME_DOT_COMPONENT))
-			return -1; /* Component starts with '.'. */
-		/*
-		 * Even if leading dots are allowed, don't allow "."
-		 * as a component (".." is prevented by a rule above).
-		 */
-		if (refname[1] == '\0')
-			return -1; /* Component equals ".". */
+}
+
+#ifdef NO_SSE42
+#define check_refname_component check_refname_component_1
+#else
+#define BLOCK_SIZE 16
+
+/* Vectorized version of check_refname_component */
+static int check_refname_component(const char *refname, int flags)
+{
+	const __m128i *refname_vec = (__m128i*) refname;
+
+	/* Character ranges for characters forbidden in refs; see
+	 * above */
+	static const __v16qi bad = {
+		0x01, 0x20,  0x7e, 0x7f,  0x5e, 0x5e,  0x3a, 0x3a,
+		0x5b, 0x5c,  0x2a, 0x2a,  0x3f, 0x3f,  0x3f, 0x3f};
+
+	static const __v16qi nonslashes = {
+		'\001', '/' -1, '/' + 1, 0xff,
+	};
+
+	static const __v16qi dotdot = {'.','.',0};
+	static const __v16qi atcurly = {'@','{',0};
+
+	const __m128i *vp;
+	const char *cp = (const char *)refname_vec;
+
+	int dotdotpos = BLOCK_SIZE, atcurlypos = BLOCK_SIZE;
+	for (vp = refname_vec; ; vp++) {
+		__m128i tmp;
+		int endpos;
+
+		/* Handle case of forbidden substrings .. and @{ crossing
+		 * sixteen-byte boundaries */
+		if (dotdotpos == 15 && *cp == '.')
+			return -1;
+
+		if (atcurlypos == 15 && *cp == '{')
+			return -1;
+
+		if (((uintptr_t) vp & (PAGE_SIZE - 1)) > PAGE_SIZE - BLOCK_SIZE)
+			/* End-of-page; fall back to slow method for
+			 * this entire component. */
+			return check_refname_component_1(refname, flags);
+
+		tmp = _mm_lddqu_si128(vp);
+
+		/* Find slashes or end-of-string. The double-negative
+		 * (negative-polarity search for non-slashes) is
+		 * necessary so that \0 will also be counted.  */
+		endpos = _mm_cmpistri((__m128i) nonslashes, tmp,
+				      _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |
+				      _SIDD_NEGATIVE_POLARITY);
+
+		if (_mm_cmpestrc((__m128i) bad, BLOCK_SIZE, tmp, endpos,
+				 _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES))
+			return -1;
+
+		dotdotpos = _mm_cmpestri((__m128i) dotdot, 2, tmp, endpos,
+					 _SIDD_UBYTE_OPS |
+					 _SIDD_CMP_EQUAL_ORDERED);
+		if (dotdotpos < 15)
+			return -1;
+
+		atcurlypos = _mm_cmpestri((__m128i) atcurly, 2, tmp, endpos,
+					  _SIDD_UBYTE_OPS |
+					  _SIDD_CMP_EQUAL_ORDERED);
+		if (atcurlypos < 15)
+			return -1;
+
+		if (endpos < BLOCK_SIZE) {
+			cp = ((const char*) vp) + endpos;
+			break;
+		}
+		cp = (const char*) vp + BLOCK_SIZE;
 	}
-	if (cp - refname >= 5 && !memcmp(cp - 5, ".lock", 5))
-		return -1; /* Refname ends with ".lock". */
-	return cp - refname;
+	return check_refname_component_trailer(cp, refname, flags);
 }
+#endif
 
 int check_refname_format(const char *refname, int flags)
 {
diff --git a/t/t5511-refspec.sh b/t/t5511-refspec.sh
index c289322..0f03f9c 100755
--- a/t/t5511-refspec.sh
+++ b/t/t5511-refspec.sh
@@ -84,4 +84,17 @@ test_refspec push 'refs/heads/*/*/for-linus:refs/remotes/mine/*' invalid
 test_refspec fetch 'refs/heads/*/for-linus:refs/remotes/mine/*'
 test_refspec push 'refs/heads/*/for-linus:refs/remotes/mine/*'
 
+test_refspec fetch 'refs/heads/a-very-long-refname'
+test_refspec fetch 'refs/heads/.a-very-long-refname'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123..'			invalid
+test_refspec fetch 'refs/heads/abcdefgh01234..'			invalid
+test_refspec fetch 'refs/heads/abcdefgh012345..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123456..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh01234567..'		invalid
+test_refspec fetch 'refs/heads/abcdefgh0123.a'
+test_refspec fetch 'refs/heads/abcdefgh01234.a'
+test_refspec fetch 'refs/heads/abcdefgh012345.a'
+test_refspec fetch 'refs/heads/abcdefgh0123456.a'
+test_refspec fetch 'refs/heads/abcdefgh01234567.a'
+
 test_done
-- 
2.0.0.rc1.18.gf763c0f

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()
  2014-06-01  5:17 [PATCH v4 1/2] refs.c: optimize check_refname_component() David Turner
  2014-06-01  5:17 ` [PATCH v4 2/2] refs.c: SSE4.2 optimizations for check_refname_component David Turner
@ 2014-06-01  7:17 ` Andreas Schwab
  2014-06-01 19:43 ` Philip Oakley
  2014-06-01 20:50 ` Michael Haggerty
  3 siblings, 0 replies; 5+ messages in thread
From: Andreas Schwab @ 2014-06-01  7:17 UTC (permalink / raw)
  To: David Turner; +Cc: Michael Haggerty, Junio C Hamano, git, David Turner

David Turner <dturner@twopensource.com> writes:

> +static unsigned char refname_disposition[] = {
> +	1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> +	9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> +	9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
> +};

> +		unsigned char ch = (unsigned char) *cp;
> +		char disp = refname_disposition[ch];

Undefined behaviour.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()
  2014-06-01  5:17 [PATCH v4 1/2] refs.c: optimize check_refname_component() David Turner
  2014-06-01  5:17 ` [PATCH v4 2/2] refs.c: SSE4.2 optimizations for check_refname_component David Turner
  2014-06-01  7:17 ` [PATCH v4 1/2] refs.c: optimize check_refname_component() Andreas Schwab
@ 2014-06-01 19:43 ` Philip Oakley
  2014-06-01 20:50 ` Michael Haggerty
  3 siblings, 0 replies; 5+ messages in thread
From: Philip Oakley @ 2014-06-01 19:43 UTC (permalink / raw)
  To: David Turner, Michael Haggerty, Junio C Hamano, git; +Cc: David Turner

From: "David Turner" <dturner@twopensource.com>
> In a repository with many refs, check_refname_component can be a major
> contributor to the runtime of some git commands. One such command is
>
> git rev-parse HEAD
>
> Timings for one particular repo, with about 60k refs, almost all
> packed, are:
>
> Old: 35 ms
> New: 29 ms
>
> Many other commands which read refs are also sped up.
>
> Signed-off-by: David Turner <dturner@twitter.com>
> ---
> refs.c | 68 
> ++++++++++++++++++++++++++++++++++++++++--------------------------
> 1 file changed, 41 insertions(+), 27 deletions(-)
>
> diff --git a/refs.c b/refs.c
> index 28d5eca..62e2301 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -5,9 +5,32 @@
> #include "dir.h"
> #include "string-list.h"
>
> +/* How to handle various characters in refnames:
> + * 0: An acceptable character for refs
> + * 1: End-of-component
> + * 2: ., look for a following . to reject .. in refs
> + * 3: @, look for a following { to reject @{ in refs
> + * 9: A bad character, reject ref
> + *
> + * See below for the list of illegal characters, from which
> + * this table is derived.

The "see below" doesn't quite scan right.  Perhaps
    The character handling rules above are used to
    derive the table below.

> + */
> +static unsigned char refname_disposition[] = {
> + 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
> +};
> +
> /*
> - * Make sure "ref" is something reasonable to have under 
> ".git/refs/";
> - * We do not like it if:
> + * Try to read one refname component from the front of refname.
> + * Return the length of the component found, or -1 if the component 
> is
> + * not legal.  It is legal if it is something reasonable to have 
> under
> + * ".git/refs/"; We do not like it if:
>  *
>  * - any path component of it begins with ".", or
>  * - it has double dots "..", or
> @@ -15,24 +38,7 @@
>  * - it ends with a "/".
>  * - it ends with ".lock"
>  * - it contains a "\" (backslash)
> - */
>
> -/* Return true iff ch is not allowed in reference names. */
> -static inline int bad_ref_char(int ch)
> -{
> - if (((unsigned) ch) <= ' ' || ch == 0x7f ||
> -     ch == '~' || ch == '^' || ch == ':' || ch == '\\')
> - return 1;
> - /* 2.13 Pattern Matching Notation */
> - if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
> - return 1;
> - return 0;
> -}
> -
> -/*
> - * Try to read one refname component from the front of refname. 
> Return
> - * the length of the component found, or -1 if the component is not
> - * legal.
>  */
> static int check_refname_component(const char *refname, int flags)
> {
> @@ -40,17 +46,25 @@ static int check_refname_component(const char 
> *refname, int flags)
>  char last = '\0';
>
>  for (cp = refname; ; cp++) {
> - char ch = *cp;
> - if (ch == '\0' || ch == '/')
> + unsigned char ch = (unsigned char) *cp;
> + char disp = refname_disposition[ch];
> + switch(disp) {
> + case 1:
> + goto out;
> + case 2:
> + if (last == '.')
> + return -1; /* Refname contains "..". */
>  break;
> - if (bad_ref_char(ch))
> - return -1; /* Illegal character in refname. */
> - if (last == '.' && ch == '.')
> - return -1; /* Refname contains "..". */
> - if (last == '@' && ch == '{')
> - return -1; /* Refname contains "@{". */
> + case 3:
> + if (last == '@')
> + return -1; /* Refname contains "@{". */
> + break;
> + case 9:
> + return -1;
> + }
>  last = ch;
>  }
> +out:
>  if (cp == refname)
>  return 0; /* Component has zero length. */
>  if (refname[0] == '.') {
> -- 
> 2.0.0.rc1.18.gf763c0f
>
> --

Philip 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()
  2014-06-01  5:17 [PATCH v4 1/2] refs.c: optimize check_refname_component() David Turner
                   ` (2 preceding siblings ...)
  2014-06-01 19:43 ` Philip Oakley
@ 2014-06-01 20:50 ` Michael Haggerty
  3 siblings, 0 replies; 5+ messages in thread
From: Michael Haggerty @ 2014-06-01 20:50 UTC (permalink / raw)
  To: David Turner, Junio C Hamano, git; +Cc: David Turner

Thanks for splitting this up into two patches.  See my comments below.

On 06/01/2014 07:17 AM, David Turner wrote:
> In a repository with many refs, check_refname_component can be a major
> contributor to the runtime of some git commands. One such command is
> 
> git rev-parse HEAD
> 
> Timings for one particular repo, with about 60k refs, almost all
> packed, are:
> 
> Old: 35 ms
> New: 29 ms
> 
> Many other commands which read refs are also sped up.
> 
> Signed-off-by: David Turner <dturner@twitter.com>
> ---
>  refs.c | 68 ++++++++++++++++++++++++++++++++++++++++--------------------------
>  1 file changed, 41 insertions(+), 27 deletions(-)
> 
> diff --git a/refs.c b/refs.c
> index 28d5eca..62e2301 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -5,9 +5,32 @@
>  #include "dir.h"
>  #include "string-list.h"
>  
> +/* How to handle various characters in refnames:

We format multiline comments with no text on the opening line:

/*
 * How to handle...
 * ... is derived.
 */

> + * 0: An acceptable character for refs
> + * 1: End-of-component
> + * 2: ., look for a following . to reject .. in refs
> + * 3: @, look for a following { to reject @{ in refs
> + * 9: A bad character, reject ref

This explanation does not agree with the code (or I'm not reading it
correctly).  For example, the character with disposition 3 is '{', not
'@', so ISTM the comment should be

     * 3: {, look for a preceding @ to reject @{ in refnames

BTW, is there a special reason that the values in your table jump from 3
to 9?  I imagine that compilers would use a jump table to implement the
"switch" statement where these values are used, and they might have a
slightly easier time if there are no "holes" between the legal values.

> + *
> + * See below for the list of illegal characters, from which
> + * this table is derived.
> + */
> +static unsigned char refname_disposition[] = {

I think you need to define the length explicitly to 256 here to cause
the entries for 0x80..0xff to be created and initialized to zeros.

The fact that you didn't notice this bug suggests that there might be no
tests involving refnames with non-ASCII characters, which would be
another nice thing to remedy while you are working in this area.

> +	1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> +	9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> +	9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
> +};
> +
>  /*
> - * Make sure "ref" is something reasonable to have under ".git/refs/";
> - * We do not like it if:
> + * Try to read one refname component from the front of refname.
> + * Return the length of the component found, or -1 if the component is
> + * not legal.  It is legal if it is something reasonable to have under
> + * ".git/refs/"; We do not like it if:
>   *
>   * - any path component of it begins with ".", or
>   * - it has double dots "..", or
> @@ -15,24 +38,7 @@
>   * - it ends with a "/".
>   * - it ends with ".lock"
>   * - it contains a "\" (backslash)
> - */
>  

^^^ doesn't this leave a blank line inside the comment?

> -/* Return true iff ch is not allowed in reference names. */
> -static inline int bad_ref_char(int ch)
> -{
> -	if (((unsigned) ch) <= ' ' || ch == 0x7f ||
> -	    ch == '~' || ch == '^' || ch == ':' || ch == '\\')
> -		return 1;
> -	/* 2.13 Pattern Matching Notation */
> -	if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
> -		return 1;
> -	return 0;
> -}
> -
> -/*
> - * Try to read one refname component from the front of refname.  Return
> - * the length of the component found, or -1 if the component is not
> - * legal.
>   */
> [...]

Michael

-- 
Michael Haggerty
mhagger@alum.mit.edu

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-06-01 20:50 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-06-01  5:17 [PATCH v4 1/2] refs.c: optimize check_refname_component() David Turner
2014-06-01  5:17 ` [PATCH v4 2/2] refs.c: SSE4.2 optimizations for check_refname_component David Turner
2014-06-01  7:17 ` [PATCH v4 1/2] refs.c: optimize check_refname_component() Andreas Schwab
2014-06-01 19:43 ` Philip Oakley
2014-06-01 20:50 ` Michael Haggerty

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).