Git development

Git development
 help / color / mirror / Atom feed

* [PATCH v2 2/3] grep: enable threading with -p and -W using lazy attribute lookup
From: Thomas Rast @ 2011-12-02 13:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Eric Herman, git, Junio C Hamano
In-Reply-To: <cover.1322830368.git.trast@student.ethz.ch>

Lazily load the userdiff attributes in match_funcname().  Use a
separate mutex around this loading to protect the (not thread-safe)
attributes machinery.  This lets us re-enable threading with -p and
-W while reducing the overhead caused by looking up attributes.

Signed-off-by: Thomas Rast <trast@student.ethz.ch>
---
 builtin/grep.c |   10 +++++++-
 grep.c         |   74 ++++++++++++++++++++++++++++++++++----------------------
 grep.h         |    7 +++++
 3 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 988ea1d..65b1ffe 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -256,6 +256,7 @@ static void start_threads(struct grep_opt *opt)
 
 	pthread_mutex_init(&grep_mutex, NULL);
 	pthread_mutex_init(&read_sha1_mutex, NULL);
+	pthread_mutex_init(&grep_attr_mutex, NULL);
 	pthread_cond_init(&cond_add, NULL);
 	pthread_cond_init(&cond_write, NULL);
 	pthread_cond_init(&cond_result, NULL);
@@ -303,6 +304,7 @@ static int wait_all(void)
 
 	pthread_mutex_destroy(&grep_mutex);
 	pthread_mutex_destroy(&read_sha1_mutex);
+	pthread_mutex_destroy(&grep_attr_mutex);
 	pthread_cond_destroy(&cond_add);
 	pthread_cond_destroy(&cond_write);
 	pthread_cond_destroy(&cond_result);
@@ -1002,9 +1004,15 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 		opt.regflags |= REG_ICASE;
 
 #ifndef NO_PTHREADS
-	if (online_cpus() == 1 || !grep_threads_ok(&opt))
+	if (online_cpus() == 1)
 		use_threads = 0;
+#else
+	use_threads = 0;
+#endif
 
+	opt.use_threads = use_threads;
+
+#ifndef NO_PTHREADS
 	if (use_threads) {
 		if (opt.pre_context || opt.post_context || opt.file_break ||
 		    opt.funcbody)
diff --git a/grep.c b/grep.c
index 7a070e9..4dd7da2 100644
--- a/grep.c
+++ b/grep.c
@@ -2,6 +2,7 @@
 #include "grep.h"
 #include "userdiff.h"
 #include "xdiff-interface.h"
+#include "thread-utils.h"
 
 void append_header_grep_pattern(struct grep_opt *opt, enum grep_header_field field, const char *pat)
 {
@@ -806,10 +807,46 @@ static void show_line(struct grep_opt *opt, char *bol, char *eol,
 	opt->output(opt, "\n", 1);
 }
 
-static int match_funcname(struct grep_opt *opt, char *bol, char *eol)
+#ifndef NO_PTHREADS
+/*
+ * This lock protects access to the gitattributes machinery, which is
+ * not thread-safe.
+ */
+pthread_mutex_t grep_attr_mutex;
+
+static inline void grep_attr_lock(struct grep_opt *opt)
+{
+	if (opt->use_threads)
+		pthread_mutex_lock(&grep_attr_mutex);
+}
+
+static inline void grep_attr_unlock(struct grep_opt *opt)
+{
+	if (opt->use_threads)
+		pthread_mutex_unlock(&grep_attr_mutex);
+}
+#else
+#define grep_attr_lock(opt)
+#define grep_attr_unlock(opt)
+#endif
+
+static int match_funcname(struct grep_opt *opt, const char *name, char *bol, char *eol)
 {
 	xdemitconf_t *xecfg = opt->priv;
-	if (xecfg && xecfg->find_func) {
+	if (xecfg && !xecfg->find_func) {
+		struct userdiff_driver *drv;
+		grep_attr_lock(opt);
+		drv = userdiff_find_by_path(name);
+		grep_attr_unlock(opt);
+		if (drv && drv->funcname.pattern) {
+			const struct userdiff_funcname *pe = &drv->funcname;
+			xdiff_set_find_func(xecfg, pe->pattern, pe->cflags);
+		} else {
+			xecfg = opt->priv = NULL;
+		}
+	}
+
+	if (xecfg) {
 		char buf[1];
 		return xecfg->find_func(bol, eol - bol, buf, 1,
 					xecfg->find_func_priv) >= 0;
@@ -835,7 +872,7 @@ static void show_funcname_line(struct grep_opt *opt, const char *name,
 		if (lno <= opt->last_shown)
 			break;
 
-		if (match_funcname(opt, bol, eol)) {
+		if (match_funcname(opt, name, bol, eol)) {
 			show_line(opt, bol, eol, name, lno, '=');
 			break;
 		}
@@ -848,7 +885,7 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
 	unsigned cur = lno, from = 1, funcname_lno = 0;
 	int funcname_needed = !!opt->funcname;
 
-	if (opt->funcbody && !match_funcname(opt, bol, end))
+	if (opt->funcbody && !match_funcname(opt, name, bol, end))
 		funcname_needed = 2;
 
 	if (opt->pre_context < lno)
@@ -864,7 +901,7 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
 		while (bol > buf && bol[-1] != '\n')
 			bol--;
 		cur--;
-		if (funcname_needed && match_funcname(opt, bol, eol)) {
+		if (funcname_needed && match_funcname(opt, name, bol, eol)) {
 			funcname_lno = cur;
 			funcname_needed = 0;
 		}
@@ -942,19 +979,6 @@ static int look_ahead(struct grep_opt *opt,
 	return 0;
 }
 
-int grep_threads_ok(const struct grep_opt *opt)
-{
-	/* If this condition is true, then we may use the attribute
-	 * machinery in grep_buffer_1. The attribute code is not
-	 * thread safe, so we disable the use of threads.
-	 */
-	if ((opt->funcname || opt->funcbody)
-	    && !opt->unmatch_name_only && !opt->status_only && !opt->name_only)
-		return 0;
-
-	return 1;
-}
-
 static void std_output(struct grep_opt *opt, const void *buf, size_t size)
 {
 	fwrite(buf, size, 1, stdout);
@@ -1008,16 +1032,8 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 	}
 
 	memset(&xecfg, 0, sizeof(xecfg));
-	if ((opt->funcname || opt->funcbody)
-	    && !opt->unmatch_name_only && !opt->status_only &&
-	    !opt->name_only && !binary_match_only && !collect_hits) {
-		struct userdiff_driver *drv = userdiff_find_by_path(name);
-		if (drv && drv->funcname.pattern) {
-			const struct userdiff_funcname *pe = &drv->funcname;
-			xdiff_set_find_func(&xecfg, pe->pattern, pe->cflags);
-			opt->priv = &xecfg;
-		}
-	}
+	opt->priv = &xecfg;
+
 	try_lookahead = should_lookahead(opt);
 
 	while (left) {
@@ -1093,7 +1109,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 				show_function = 1;
 			goto next_line;
 		}
-		if (show_function && match_funcname(opt, bol, eol))
+		if (show_function && match_funcname(opt, name, bol, eol))
 			show_function = 0;
 		if (show_function ||
 		    (last_hit && lno <= last_hit + opt->post_context)) {
diff --git a/grep.h b/grep.h
index a652800..15d227c 100644
--- a/grep.h
+++ b/grep.h
@@ -115,6 +115,7 @@ struct grep_opt {
 	int show_hunk_mark;
 	int file_break;
 	int heading;
+	int use_threads;
 	void *priv;
 
 	void (*output)(struct grep_opt *opt, const void *data, size_t size);
@@ -131,4 +132,10 @@ struct grep_opt {
 extern struct grep_opt *grep_opt_dup(const struct grep_opt *opt);
 extern int grep_threads_ok(const struct grep_opt *opt);
 
+#ifndef NO_PTHREADS
+/* Mutex used around access to the attributes machinery if
+ * opt->use_threads.  Must be initialized/destroyed by callers! */
+extern pthread_mutex_t grep_attr_mutex;
+#endif
+
 #endif
-- 
1.7.8.rc4.388.ge53ab

^ permalink raw reply related

* [PATCH v2 0/3] grep multithreading and scaling
From: Thomas Rast @ 2011-12-02 13:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Eric Herman, git, Junio C Hamano
In-Reply-To: <201111291507.04754.trast@student.ethz.ch>

[Eric, I measured some numbers that may be interesting to the
discussion about b2924dc.  See below.]

This round wraps up the original patch I posted, plus the draft patch
I posted inline the other day with René's review taken into account.
I also added a patch that rips out threading in the non-worktree case;
read on for the reasoning.

René Scharfe wrote:
> Hmm, why are [gitattributes lookups] that expensive?
> 
> callgrind tells me that userdiff_find_by_path() contributes only 0.18%
> to the total cost with your first patch.  Timings in my virtual machine
> are very volatile, but it seems that here the difference is in the
> system time while user is basically the same for all combinations of
> patches.

Well, turns out I was measuring something completely stupid.  I had

  git grep --cached -W INITRAMFS_ROOT_UID

where I put the --cached originally because that makes it independent
of the worktree (which in the very first measurements I still had
wiped, as I tend to do for this repo; I checked it out again after
that).  This in fact gives me (~/g/git-grep --cached
INITRAMFS_ROOT_UID, leaving aside -W; best of 10):

  THREADS=8:   2.88user 0.21system 0:02.94elapsed
  THREADS=4:   2.89user 0.29system 0:02.99elapsed
  THREADS=2:   2.83user 0.36system 0:02.87elapsed
  NO_PTHREADS: 2.16user 0.08system 0:02.25elapsed

Uhuh.  Doesn't scale so well after all.  But removing the --cached, as
most people probably would:

  THREADS=8:   0.19user 0.32system 0:00.16elapsed
  THREADS=4:   0.16user 0.34system 0:00.17elapsed
  THREADS=2:   0.18user 0.32system 0:00.26elapsed
  NO_PTHREADS: 0.12user 0.17system 0:00.31elapsed

So I conclude that during any grep that cannot use the worktree,
having any threads hurts.

In addition, during a grep that *can* use the worktree, THREADS=8
still helps somewhat on my dual-core i7, though it goes downhill from
there (12 is again as fast as 4; I verified these details using
best-of-50 timings, and it is reproducible.)

I have also run timings on a 2*6-core workstation running OS X, where
performance is best at 5 threads:

  2 threads:  0.96 real   0.41 user   1.27 sys
  3 threads:  0.68 real   0.41 user   1.30 sys
  4 threads:  0.54 real   0.43 user   1.63 sys
  5 threads:  0.50 real   0.41 user   1.51 sys
  6 threads:  0.54 real   0.43 user   1.63 sys
  7 threads:  0.86 real   0.49 user   1.93 sys
  8 threads:  0.98 real   0.51 user   2.07 sys

I kid you not.  That's best-of-50 and rather stable.  It's on the same
tree as the Linux machine too, except for the problem that the OS X FS
is set to case-insensitive and thus cannot represent the tree exactly.
So from git's POV, there are unstaged changes.

Sadly I do not have access to a Linux box having more than 2 physical
cores.  If you have one, please run some tests :-)

So based on my measurements, I would suggest that unless we have
evidence of it scaling beyond 8 cores on some machine, b2924dc (grep:
detect number of CPUs for thread spawning) be dropped.  For now I'm
ignoring the problem that on OS X it doesn't even scale to 8; I'd
rather check how it fares on Linux first.

I added a third patch on top that disables threading in any case that
does not hit the worktree.  I wonder if I missed something or if it
really is that simple.  The neat part is that it's also a reduction in
code required, and at the same time avoids any issues 2/3 might have
with a future attributes-from-trees implementation.

With this I get

  worktree, 8 threads: 0.15user 0.37system 0:00.17elapsed
  --cached, 8 threads: 2.18user 0.07system 0:02.27elapsed

Of course, we could probably gain a huge boost if the read_sha1
machinery could be made threaded, so that it can unpack several
objects at a time.  In addition, I can well imagine that there are
combinations of delta density, object size, and luck where it pays off
to grep in parallel.  Do we care?

Now I really should do something else than fretting over the
sub-second performance of git-grep...

Thomas Rast (3):
  grep: load funcname patterns for -W
  grep: enable threading with -p and -W using lazy attribute lookup
  grep: disable threading in all but worktree case

 builtin/grep.c  |  153 ++++++++++++++++--------------------------------------
 grep.c          |   73 ++++++++++++++++----------
 grep.h          |    7 +++
 t/t7810-grep.sh |   14 +++++
 4 files changed, 112 insertions(+), 135 deletions(-)

-- 
1.7.8.rc4.388.ge53ab

^ permalink raw reply

* [PATCH v2 1/3] grep: load funcname patterns for -W
From: Thomas Rast @ 2011-12-02 13:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Eric Herman, git, Junio C Hamano
In-Reply-To: <cover.1322830368.git.trast@student.ethz.ch>

git-grep avoids loading the funcname patterns unless they are needed.
ba8ea74 (grep: add option to show whole function as context,
2011-08-01) forgot to extend this test also to the new funcbody
feature.  Do so.

The catch is that we also have to disable threading when using
userdiff, as explained in grep_threads_ok().  So we must be careful to
introduce the same test there.
---
 grep.c          |    7 ++++---
 t/t7810-grep.sh |   14 ++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/grep.c b/grep.c
index b29d09c..7a070e9 100644
--- a/grep.c
+++ b/grep.c
@@ -948,8 +948,8 @@ int grep_threads_ok(const struct grep_opt *opt)
 	 * machinery in grep_buffer_1. The attribute code is not
 	 * thread safe, so we disable the use of threads.
 	 */
-	if (opt->funcname && !opt->unmatch_name_only && !opt->status_only &&
-	    !opt->name_only)
+	if ((opt->funcname || opt->funcbody)
+	    && !opt->unmatch_name_only && !opt->status_only && !opt->name_only)
 		return 0;
 
 	return 1;
@@ -1008,7 +1008,8 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 	}
 
 	memset(&xecfg, 0, sizeof(xecfg));
-	if (opt->funcname && !opt->unmatch_name_only && !opt->status_only &&
+	if ((opt->funcname || opt->funcbody)
+	    && !opt->unmatch_name_only && !opt->status_only &&
 	    !opt->name_only && !binary_match_only && !collect_hits) {
 		struct userdiff_driver *drv = userdiff_find_by_path(name);
 		if (drv && drv->funcname.pattern) {
diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh
index 81263b7..7ba5b16 100755
--- a/t/t7810-grep.sh
+++ b/t/t7810-grep.sh
@@ -523,6 +523,20 @@ test_expect_success 'grep -W' '
 	test_cmp expected actual
 '
 
+cat >expected <<EOF
+hello.c=	printf("Hello world.\n");
+hello.c:	return 0;
+hello.c-	/* char ?? */
+EOF
+
+test_expect_success 'grep -W with userdiff' '
+	test_when_finished "rm -f .gitattributes" &&
+	git config diff.custom.xfuncname "(printf.*|})$" &&
+	echo "hello.c diff=custom" >.gitattributes &&
+	git grep -W return >actual &&
+	test_cmp expected actual
+'
+
 test_expect_success 'grep from a subdirectory to search wider area (1)' '
 	mkdir -p s &&
 	(
-- 
1.7.8.rc4.388.ge53ab

^ permalink raw reply related

* [PATCH v2 3/3] grep: disable threading in all but worktree case
From: Thomas Rast @ 2011-12-02 13:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Eric Herman, git, Junio C Hamano
In-Reply-To: <cover.1322830368.git.trast@student.ethz.ch>

Measuring grep performance showed that in all but the worktree case
(as opposed to --cached, <committish> or <treeish>), threading
actually slows things down.  For example, on my dual-core
hyperthreaded i7 in a linux-2.6.git at v2.6.37-rc2, I got:

Threads     | --cached case                 | worktree case
--------------------------------------------------------------------------
8 (default) | 2.17user 0.15sys 0:02.20real  | 0.11user 0.26sys 0:00.11real
4           | 2.06user 0.17sys 0:02.08real  | 0.11user 0.26sys 0:00.12real
2           | 2.02user 0.25sys 0:02.08real  | 0.15user 0.37sys 0:00.28real
NO_PTHREADS | 1.57user 0.05sys 0:01.64real  | 0.09user 0.12sys 0:00.22real

I conjecture that this is caused by contention on read_sha1_mutex.

So disable threading entirely when not scanning the worktree, to get
the NO_PTHREADS performance in that case.  This obsoletes all code
related to grep_sha1_async.  The thread startup must be delayed until
after all arguments have been parsed, but this does not have a
measurable effect.
---
 builtin/grep.c |  157 ++++++++++++++++----------------------------------------
 1 files changed, 44 insertions(+), 113 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index 65b1ffe..edf6a31 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -34,21 +34,13 @@
 		       const char *name);
 static void *load_file(const char *filename, size_t *sz);
 
-enum work_type {WORK_SHA1, WORK_FILE};
-
 /* We use one producer thread and THREADS consumer
  * threads. The producer adds struct work_items to 'todo' and the
  * consumers pick work items from the same array.
  */
 struct work_item {
-	enum work_type type;
 	char *name;
-
-	/* if type == WORK_SHA1, then 'identifier' is a SHA1,
-	 * otherwise type == WORK_FILE, and 'identifier' is a NUL
-	 * terminated filename.
-	 */
-	void *identifier;
+	char *filename;
 	char done;
 	struct strbuf out;
 };
@@ -86,21 +78,6 @@ static inline void grep_unlock(void)
 		pthread_mutex_unlock(&grep_mutex);
 }
 
-/* Used to serialize calls to read_sha1_file. */
-static pthread_mutex_t read_sha1_mutex;
-
-static inline void read_sha1_lock(void)
-{
-	if (use_threads)
-		pthread_mutex_lock(&read_sha1_mutex);
-}
-
-static inline void read_sha1_unlock(void)
-{
-	if (use_threads)
-		pthread_mutex_unlock(&read_sha1_mutex);
-}
-
 /* Signalled when a new work_item is added to todo. */
 static pthread_cond_t cond_add;
 
@@ -114,7 +91,7 @@ static inline void read_sha1_unlock(void)
 
 static int skip_first_line;
 
-static void add_work(enum work_type type, char *name, void *id)
+static void add_work(char *name, char *filename)
 {
 	grep_lock();
 
@@ -122,9 +99,8 @@ static void add_work(enum work_type type, char *name, void *id)
 		pthread_cond_wait(&cond_write, &grep_mutex);
 	}
 
-	todo[todo_end].type = type;
 	todo[todo_end].name = name;
-	todo[todo_end].identifier = id;
+	todo[todo_end].filename = filename;
 	todo[todo_end].done = 0;
 	strbuf_reset(&todo[todo_end].out);
 	todo_end = (todo_end + 1) % ARRAY_SIZE(todo);
@@ -152,19 +128,10 @@ static void add_work(enum work_type type, char *name, void *id)
 	return ret;
 }
 
-static void grep_sha1_async(struct grep_opt *opt, char *name,
-			    const unsigned char *sha1)
-{
-	unsigned char *s;
-	s = xmalloc(20);
-	memcpy(s, sha1, 20);
-	add_work(WORK_SHA1, name, s);
-}
-
 static void grep_file_async(struct grep_opt *opt, char *name,
 			    const char *filename)
 {
-	add_work(WORK_FILE, name, xstrdup(filename));
+	add_work(name, xstrdup(filename));
 }
 
 static void work_done(struct work_item *w)
@@ -194,7 +161,7 @@ static void work_done(struct work_item *w)
 			write_or_die(1, p, len);
 		}
 		free(w->name);
-		free(w->identifier);
+		free(w->filename);
 	}
 
 	if (old_done != todo_done)
@@ -213,29 +180,18 @@ static void work_done(struct work_item *w)
 
 	while (1) {
 		struct work_item *w = get_work();
+		size_t sz;
+		void* data;
+
 		if (!w)
 			break;
 
 		opt->output_priv = w;
-		if (w->type == WORK_SHA1) {
-			unsigned long sz;
-			void* data = load_sha1(w->identifier, &sz, w->name);
-
-			if (data) {
-				hit |= grep_buffer(opt, w->name, data, sz);
-				free(data);
-			}
-		} else if (w->type == WORK_FILE) {
-			size_t sz;
-			void* data = load_file(w->identifier, &sz);
-			if (data) {
-				hit |= grep_buffer(opt, w->name, data, sz);
-				free(data);
-			}
-		} else {
-			assert(0);
+		data = load_file(w->filename, &sz);
+		if (data) {
+			hit |= grep_buffer(opt, w->name, data, sz);
+			free(data);
 		}
-
 		work_done(w);
 	}
 	free_grep_patterns(arg);
@@ -255,7 +211,6 @@ static void start_threads(struct grep_opt *opt)
 	int i;
 
 	pthread_mutex_init(&grep_mutex, NULL);
-	pthread_mutex_init(&read_sha1_mutex, NULL);
 	pthread_mutex_init(&grep_attr_mutex, NULL);
 	pthread_cond_init(&cond_add, NULL);
 	pthread_cond_init(&cond_write, NULL);
@@ -303,7 +258,6 @@ static int wait_all(void)
 	}
 
 	pthread_mutex_destroy(&grep_mutex);
-	pthread_mutex_destroy(&read_sha1_mutex);
 	pthread_mutex_destroy(&grep_attr_mutex);
 	pthread_cond_destroy(&cond_add);
 	pthread_cond_destroy(&cond_write);
@@ -312,9 +266,6 @@ static int wait_all(void)
 	return hit;
 }
 #else /* !NO_PTHREADS */
-#define read_sha1_lock()
-#define read_sha1_unlock()
-
 static int wait_all(void)
 {
 	return 0;
@@ -371,21 +322,11 @@ static int grep_config(const char *var, const char *value, void *cb)
 	return 0;
 }
 
-static void *lock_and_read_sha1_file(const unsigned char *sha1, enum object_type *type, unsigned long *size)
-{
-	void *data;
-
-	read_sha1_lock();
-	data = read_sha1_file(sha1, type, size);
-	read_sha1_unlock();
-	return data;
-}
-
 static void *load_sha1(const unsigned char *sha1, unsigned long *size,
 		       const char *name)
 {
 	enum object_type type;
-	void *data = lock_and_read_sha1_file(sha1, &type, size);
+	void *data = read_sha1_file(sha1, &type, size);
 
 	if (!data)
 		error(_("'%s': unable to read %s"), name, sha1_to_hex(sha1));
@@ -398,6 +339,9 @@ static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1,
 {
 	struct strbuf pathbuf = STRBUF_INIT;
 	char *name;
+	int hit;
+	unsigned long sz;
+	void *data;
 
 	if (opt->relative && opt->prefix_length) {
 		quote_path_relative(filename + tree_name_len, -1, &pathbuf,
@@ -409,25 +353,15 @@ static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1,
 
 	name = strbuf_detach(&pathbuf, NULL);
 
-#ifndef NO_PTHREADS
-	if (use_threads) {
-		grep_sha1_async(opt, name, sha1);
-		return 0;
-	} else
-#endif
-	{
-		int hit;
-		unsigned long sz;
-		void *data = load_sha1(sha1, &sz, name);
-		if (!data)
-			hit = 0;
-		else
-			hit = grep_buffer(opt, name, data, sz);
+	data = load_sha1(sha1, &sz, name);
+	if (!data)
+		hit = 0;
+	else
+		hit = grep_buffer(opt, name, data, sz);
 
-		free(data);
-		free(name);
-		return hit;
-	}
+	free(data);
+	free(name);
+	return hit;
 }
 
 static void *load_file(const char *filename, size_t *sz)
@@ -586,7 +520,7 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 			void *data;
 			unsigned long size;
 
-			data = lock_and_read_sha1_file(entry.sha1, &type, &size);
+			data = read_sha1_file(entry.sha1, &type, &size);
 			if (!data)
 				die(_("unable to read tree (%s)"),
 				    sha1_to_hex(entry.sha1));
@@ -616,10 +550,8 @@ static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
 		struct strbuf base;
 		int hit, len;
 
-		read_sha1_lock();
 		data = read_object_with_reference(obj->sha1, tree_type,
 						  &size, NULL);
-		read_sha1_unlock();
 
 		if (!data)
 			die(_("unable to read tree (%s)"), sha1_to_hex(obj->sha1));
@@ -1003,26 +935,6 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 	if (!opt.fixed && opt.ignore_case)
 		opt.regflags |= REG_ICASE;
 
-#ifndef NO_PTHREADS
-	if (online_cpus() == 1)
-		use_threads = 0;
-#else
-	use_threads = 0;
-#endif
-
-	opt.use_threads = use_threads;
-
-#ifndef NO_PTHREADS
-	if (use_threads) {
-		if (opt.pre_context || opt.post_context || opt.file_break ||
-		    opt.funcbody)
-			skip_first_line = 1;
-		start_threads(&opt);
-	}
-#else
-	use_threads = 0;
-#endif
-
 	compile_grep_patterns(&opt);
 
 	/* Check revs and then paths */
@@ -1044,6 +956,25 @@ int cmd_grep(int argc, const char **argv, const char *prefix)
 		break;
 	}
 
+#ifndef NO_PTHREADS
+	if (online_cpus() == 1 || cached || list.nr)
+		use_threads = 0;
+#else
+	use_threads = 0;
+#endif
+
+	opt.use_threads = use_threads;
+
+#ifndef NO_PTHREADS
+	if (use_threads) {
+		opt.use_threads = use_threads;
+		if (opt.pre_context || opt.post_context || opt.file_break ||
+		    opt.funcbody)
+			skip_first_line = 1;
+		start_threads(&opt);
+	}
+#endif
+
 	/* The rest are paths */
 	if (!seen_dashdash) {
 		int j;
-- 
1.7.8.rc4.388.ge53ab

^ permalink raw reply related

* Re: Suggestion on hashing
From: Nguyen Thai Ngoc Duy @ 2011-12-02 14:22 UTC (permalink / raw)
  To: Bill Zaumen; +Cc: Jeff King, Git Mailing List
In-Reply-To: <1322813319.4340.109.camel@yos>

(I'm not sure why you dropped git@vger. I see nothing private here so
I bring git@vger back)

On Fri, Dec 2, 2011 at 3:08 PM, Bill Zaumen <bill.zaumen@gmail.com> wrote:
> At one point Nguyen said that "What I'm thinking is whether it's
> possible to decouple two sha-1 roles in git, as object identifier
> and digest, separately. Each sha-1 identifies an object and an extra
> set of digests on the "same" object."
>
> My code pretty much does that (it just uses a CRC instead of a real
> digest, but I can easily change that).

It'd be easier to look at your code if you split it into a series of
smaller patches.

> So the question is whether
> using SHA-1 as an ID and SHA-256(?) as a digest is a better long term
> solution than simply replacing SHA-1.

I would not stick with any algorithm permanently. No one knows when
SHA-256 might be broken.

> If there is some interest in pursuing it further, I could make those
> changes fairly easily.  Then you'd have two message digests, a SHA-1
> and a longer one, with the longer one stored parallel to the actual
> object. Then it becomes easy to compute a digest of all the digests
> in a commit's tree and store that in a commit, if that is what you
> want to do.

I personally would like to see how it works out especially when
computing new digests is much more expensive than SHA-1. And I hope
that by delaying computing new digests (stored outside actual
objects), we could make minimum code changes to git. Though security
concerns may be the killer factor and I haven't worked that out yet.

> Replacing SHA-1 with something like SHA-256 sounds easier to implement,

SHA-1 characteristics (like 20 byte length) are hard coded everywhere
in git, it'd be a big audit.

> but the problem is all the existing repositories.  While rewriting all
> the objects and trees to use new hashes is similar to a rebase in most
> cases, there is a complication - submodules.  Git stores the hash of
> a submodule's commit in its tree because a particular revision of
> a project 'goes' with a particular revision of a submodule. But, a
> submodule can exist in one revision and not in the next or previous
> revision.  Furthermore A could be a submodule of B at one point in time,
> and many commits later, B could end up being a submodule of A.
> Fixing it up could be pretty complicated (plus having to deal with
> network failures - to update GitHub for example, you'd have to download
> submodules it uses, possibly from somewhere else and some submodules may
> not be publicly accessible (e.g., a private project kept on GitHub but
> with a critical submodule kept in house behind a corporate firewall).
> Also, you might have to update a git repository and its submodules
> concurrently, so that you always can find a new value when you need
> it.
>
> My guess is that this could be far more complicated than what I did.
> Excluding two files that are not used (the symbol PACKDB is not
> defined), I added two new files, crcdb.h and objd-crcdb.c which store
> CRCs for loose objects - 517 lines total including lots of comments in
> the header file - full documentation for each function.  The other
> changes include 1475 lines of new code in previously existing git files
> and 136 deletions (most trivial).  There were also minor changes to
> the makefile and test scripts.

You'd need to convince git maintainer this is worth doing first,
before talking how big the changes are ;-)

> Bill
-- 
Duy

^ permalink raw reply

* Re: Proposal: create meaningful aliases for git reset's hard/soft/mixed
From: Philippe Vaucher @ 2011-12-02 14:27 UTC (permalink / raw)
  To: Phil Hord; +Cc: Junio C Hamano, git, Christian Couder
In-Reply-To: <CABURp0rtCUbJXLHtXv_1g6GRKL3mX-T+3vN1=QO4CUibqXdEMg@mail.gmail.com>

> > Why worse? I'd understand if you said it doesn't improve it enough
> > for it to be worth the change tho.
>
> I think that's what "you should aim higher" means.

Yes, but my question was why was the proposal _worse_ in his mind.
Anyway, it's not really important, probably something he typed in a
hurry.


> How about:
>  --soft: git checkout -B <commit>
>  --mixed: git reset -- <paths>
>  --hard:  git checkout --clean

I like the idea... but as others pointed out those are not equivalent.

Maybe we'd start by listing the features we want to be able to do:

- Move git's HEAD to a particular commit without touching the files or the index
- Move git's HEAD to a particular commit and clear the index but
without touching the files
- Move git's HEAD to a particular commit and clear the index and have
all the files match that particular commit files
- Move git's HEAD to a particular commit and clear the index and have
all the files match that particular commit files and remove files that
are unknown to that commit

Is there a scenario I'm missing? Once we have the scenarios nailed
down we can start thinking about how to express them.

Philippe

^ permalink raw reply

* Re: Workflow Recommendation - Probably your 1000th
From: Stephen Bash @ 2011-12-02 15:14 UTC (permalink / raw)
  To: bradford; +Cc: git
In-Reply-To: <CAEbKVFQLvyTq+VL9DJZtp4YZLUgeR56N9u5RrsGqEB=e81O3zQ@mail.gmail.com>

----- Original Message -----
> From: "bradford" <fingermark@gmail.com>
> To: "Stephen Bash" <bash@genarts.com>
> Cc: git@vger.kernel.org
> Sent: Thursday, December 1, 2011 3:46:52 PM
> Subject: Re: Workflow Recommendation - Probably your 1000th
> 
> Thanks, Stephen.   I guess I'm looking for more input on the
> advantages and disadvantages of using a QA and production branch vs
> just doing everything out of master.
> 
> Trying to go through the following:
> http://news.ycombinator.com/item?id=1617425
> scottchacon.com/2011/08/31/github-flow.html
> 
> We have some weeks where we release very frequently and some weeks
> where we release only once a week and have to do production fixes in
> the meantime.  Sure other people have similar experiences.

Before continuing I guess two key assumptions factor into our workflow:
 1) we still work in a traditional major/minor release cycle with potentially weeks or even months between releases
 2) our customers can be running almost any historical version of our software

From that perspective having a maintenance branch for each major revision of our software gives us a holding area where devs can fix bugs at any time without necessarily going through the entire tag/release/merge process (you can envision a "hot fix branch" that is long-lived).  For example, we often have documentation fixes that will sit on the maintenance branch until a software fix needs to go out.  But other non-critical fixes also end up waiting on something that really requires a maintenance release (or enough fixes pile up and necessitate a release themselves).

HTH,
Stephen

^ permalink raw reply

* Re: Proposal: create meaningful aliases for git reset's hard/soft/mixed
From: Phil Hord @ 2011-12-02 15:28 UTC (permalink / raw)
  To: Thomas Rast; +Cc: Philippe Vaucher, Junio C Hamano, git, Christian Couder
In-Reply-To: <201112020826.14114.trast@student.ethz.ch>

On Fri, Dec 2, 2011 at 2:26 AM, Thomas Rast <trast@student.ethz.ch> wrote:
> Phil Hord wrote:
>>
>> Think outside the "reset" command.  Like this:
>>
>> From the "most popular" comment on http://progit.org/2011/07/11/reset.html:
>> > I remember them as:
>> > --soft      -> git uncommit
>> > --mixed  -> git unadd
>> > --hard     -> git undo
>>
>> I don't particularly like these names, but conceptually they are helpful.
>
> I think all of these, but the last one in particular, are *very*
> dangerous oversimplifications.  Doubly so if you then use "undo" with
> a revision argument.

I agree.  That's why I also said this:

> How about:
>  --soft: git checkout -B <commit>
>  --mixed: git reset -- <paths>
>  --hard:  git checkout --clean

But maybe I wasn't clear enough.  I'm not suggesting git-alias for
these.  I am proposing new commands to replace common usages of
git-reset.  These commands would need basic safeguards against
foot-shooting, of course.

Phil

^ permalink raw reply

* Re: Proposal: create meaningful aliases for git reset's hard/soft/mixed
From: Phil Hord @ 2011-12-02 15:38 UTC (permalink / raw)
  To: Philippe Vaucher; +Cc: Junio C Hamano, git, Christian Couder
In-Reply-To: <CAGK7Mr7zdstbm7QsrYq9a6m9ui_r8Ak8XtyWADLQ0n-mXiov4w@mail.gmail.com>

On Fri, Dec 2, 2011 at 9:27 AM, Philippe Vaucher
<philippe.vaucher@gmail.com> wrote:
> Maybe we'd start by listing the features we want to be able to do:
>
> - Move git's HEAD to a particular commit without touching the files or the index
> - Move git's HEAD to a particular commit and clear the index but
> without touching the files
> - Move git's HEAD to a particular commit and clear the index and have
> all the files match that particular commit files
> - Move git's HEAD to a particular commit and clear the index and have
> all the files match that particular commit files and remove files that
> are unknown to that commit
>
> Is there a scenario I'm missing? Once we have the scenarios nailed
> down we can start thinking about how to express them.

Aim higher.

Do not think about the git-reset command and all of its features.
Moreover, do not limit yourself to git-reset's functionality.

Think about why you need to use git-reset.  Why do new users need to
use git-reset?  What is it they are after?

For me, it was the three I mentioned before.

So, let's look at yours:

> - Move git's HEAD to a particular commit without touching the files or the index

I know what this is, but I don't know how to describe it without saying
"reset".  It's like teleportation.  "Move me to a new location in the
tree".
git teleport <commit>


> - Move git's HEAD to a particular commit and clear the index but
> without touching the files

git teleport --index <commit>


> - Move git's HEAD to a particular commit and clear the index and have
> all the files match that particular commit files

git checkout --clean <commit>


> - Move git's HEAD to a particular commit and clear the index and have
> all the files match that particular commit files and remove files that
> are unknown to that commit

git checkout --clean <commit> && git clean -fd  # maybe this needs a switch?


One you left out is this:
- Do NOT move git's HEAD; clear the index and workdir

git reset


I think the ability to move git's HEAD is what makes reset dangerous,
especially in the hands of new users.

Phil

^ permalink raw reply

* Re: [PATCH v2 3/3] grep: disable threading in all but worktree case
From: René Scharfe @ 2011-12-02 16:15 UTC (permalink / raw)
  To: Thomas Rast; +Cc: Eric Herman, git, Junio C Hamano
In-Reply-To: <5328add8b32f83b4cdbd2e66283f77c125ec127a.1322830368.git.trast@student.ethz.ch>

Am 02.12.2011 14:07, schrieb Thomas Rast:
> Measuring grep performance showed that in all but the worktree case
> (as opposed to --cached, <committish> or <treeish>), threading
> actually slows things down.  For example, on my dual-core
> hyperthreaded i7 in a linux-2.6.git at v2.6.37-rc2, I got:
>
> Threads       worktree case                 | --cached case
> --------------------------------------------------------------------------
> 8 (default) | 2.17user 0.15sys 0:02.20real  | 0.11user 0.26sys 0:00.11real
> 4           | 2.06user 0.17sys 0:02.08real  | 0.11user 0.26sys 0:00.12real
> 2           | 2.02user 0.25sys 0:02.08real  | 0.15user 0.37sys 0:00.28real
> NO_PTHREADS | 1.57user 0.05sys 0:01.64real  | 0.09user 0.12sys 0:00.22real

Are the columns mixed up?

> I conjecture that this is caused by contention on read_sha1_mutex.

Yeah, and I wonder why we need to have this lock in the first place. In 
theory, multiple readers shouldn't have to affect each other at all, 
right?  The lock could be pushed down into read_sha1_file(), or a 
thread-safe variant of the function added.

In practice, however, the code in sha1_file.c etc. scares me. ;-)

> So disable threading entirely when not scanning the worktree, to get
> the NO_PTHREADS performance in that case.  This obsoletes all code
> related to grep_sha1_async.  The thread startup must be delayed until
> after all arguments have been parsed, but this does not have a
> measurable effect.

This is a bit radical.  I think the underlying issue that 
read_sha1_file() is not thread-safe can be solved eventually and then 
we'd need to re-add that code.

How about adding a parameter to control the number of threads 
(--threads?) instead that defaults to eight (or five) for the worktree 
and one for the rest?  That would also make benchmarking easier.

René

PS: Patches one and three missed a signoff.

^ permalink raw reply

* Re: Git Install link is broken
From: Konstantin Khomoutov @ 2011-12-02 16:16 UTC (permalink / raw)
  To: Graham Wideman; +Cc: git
In-Reply-To: <20111201235608.EGUQ3756.fed1rmfepo203.cox.net@fed1rmimpo306.cox.net>

On Thu, 01 Dec 2011 15:56:12 -0800
Graham Wideman <initcontact@grahamwideman.com> wrote:

> On this page:
> http://code.google.com/p/msysgit/
> 
> the links to "install msysGit" point to:
> https://git.wiki.kernel.org/index.php/MSysGit:InstallMSysGit
> 
> .. which returns page not found error.
http://groups.google.com/group/msysgit/browse_thread/thread/6412cc38d14b612d/37c8653e45dcc14c

^ permalink raw reply

* git auto-repack is broken...
From: Linus Torvalds @ 2011-12-02 16:22 UTC (permalink / raw)
  To: Junio C Hamano, Git Mailing List

I actually tend to repack things pretty religiously (ok, not really,
but I do "git gc" reasonably regularly), so I was surprised to see
this:

  Auto packing the repository for optimum performance. You may also
  run "git gc" manually. See "git help gc" for more information.

followed by this pitiful effort:

  Counting objects: 8, done.
  Delta compression using up to 4 threads.
  Compressing objects: 100% (8/8), done.
  Writing objects: 100% (8/8), done.
  Total 8 (delta 0), reused 0 (delta 0)

Ok, those 8 objects will *not* help anything at all, and the
autorepack is broken.

So what's going on? It turns out that I have a fair amount of
unreachable objects in this repository, because I do things like
fetching things without then merging them, etc. So the "git gc --auto"
will happily do "git repack -A" or whatever, and that in turn does
*nothing* what-so-ever (or rather, it packs my latest merge commit
like the above and generates that pack of a whopping 8 objects).

I can fix it with "git gc --prune=now", so it's not like I personally
really care, but since the whole point of "git gc --auto" is to allow
people who don't know what they are doing to ignore the whole issue of
GC and pruning, I do think this is a real UI bug.

I don't really have any suggestions for fixing it, though. Maybe we
should make "git gc --auto" remove any unreachable objects? That would
be potentially dangerous in shared repository situations, though. Or
have an extra option to "git repack -A" to also pack any loose objects
it finds at the end (whether reachable or not)?

                         Linus

^ permalink raw reply

* Re: git auto-repack is broken...
From: Ævar Arnfjörð Bjarmason @ 2011-12-02 16:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
In-Reply-To: <CA+55aFznj49hx6Ce6NhJ1rRd2nvNyOERseyyrC6SNcW-z9dyfg@mail.gmail.com>

On Fri, Dec 2, 2011 at 17:22, Linus Torvalds
<torvalds@linux-foundation.org> wrote:

> Maybe we should make "git gc --auto" remove any unreachable objects?

Wouldn't that mean that any loose commit objects you have lying around
would be removed by the automatic git gc?

One feature of git that I personally rely on is that I can liberally
move heads around / make commits on detached heads and not have those
commits gc'd unless I explicitly ask for it for a while.

^ permalink raw reply

* Re: git auto-repack is broken...
From: Linus Torvalds @ 2011-12-02 16:56 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: Junio C Hamano, Git Mailing List
In-Reply-To: <CACBZZX7Q5qb1r_Oh0QfMiWh9UAM1c6QWBn4abv-xHpFBaKuyKg@mail.gmail.com>

On Fri, Dec 2, 2011 at 8:27 AM, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>> Maybe we should make "git gc --auto" remove any unreachable objects?
>
> Wouldn't that mean that any loose commit objects you have lying around
> would be removed by the automatic git gc?
>
> One feature of git that I personally rely on is that I can liberally
> move heads around / make commits on detached heads and not have those
> commits gc'd unless I explicitly ask for it for a while.

Well, with reflogs, you actually do have those objects reachable for
quite a while (90 days by default).

The "unreachable objects" tends to happen when you do fetches without
ever merging the result or actually remove branches (and/or expiring
the reflogs early etc). Not from the normal "use 'git reset' and
friends to move heads around".

That said, I do agree that removing loose objects is the much less
safe approach.

Of course, repacking the objects results in problems too: now you've
entirely lost the age information for that object, so now you cannot
prune it based on age any more.

But leaving the loose objects around and basically failing auto-gc
isn't good either.

                     Linus

^ permalink raw reply

* Re: [PATCH] Implement fast hash-collision detection
From: Jeff King @ 2011-12-02 17:00 UTC (permalink / raw)
  To: Bill Zaumen; +Cc: git, gitster, pclouds, spearce, torvalds
In-Reply-To: <1322794744.1673.494.camel@yos>

On Thu, Dec 01, 2011 at 06:59:04PM -0800, Bill Zaumen wrote:

> > What about the server being more clever about hiding the replacement
> > object? E.g., instead of just breaking into kernel.org and inserting a
> > replacement object, the attacker runs a malicious git-daemon that
> > returns the bogus object to cloners, but the real object to fetchers.
> 
> That's really a server-security issue, not a git one.  Perhaps
> repositories should be configured so that all the executables are on
> read-only partitions.  It's an important question in general of
> course, but it is probably useful to distinguish attacks that put
> bad data on a server from ones that install new software.

I don't agree here. You have to assume that the attacker will ignore
attacks you have blocked, but continue with ones you haven't (just to
counter your example, why not replace the running git-daemon
in-memory?).

You can target the narrow window of attacks that compromise the on-disk
repository without being able to execute arbitrary code. But I don't see
a point. After the kernel.org hack, yes, people are interested in
hardening kernel.org. But they are much more interested in cryptographic
sources of authority that let us not have to trust kernel.org at all.
Having some weird half-way trust just complicates things.

> > But we can already do that. Assume you have an existing repo "foo". To
> > verify the copy at git://example.com/foo.git, do a fresh clone to
> > "bar", and then compare the objects in "foo" to "bar", either byte-wise
> > or by digest.
> 
> Of course, but that is an expensive operation - in the case of Git
> transferring some 50 MBytes of data per repository.  A command to
> fetch the SHA-1 ID and a CRC or message digest for each object would
> not only run faster, but should put a much lower load on the server.

Yes, it is more expensive. But again, my threat model is that the server
is not trusted to serve data accurately or consistently. So you can't
come to the server and say "Hey, I'm doing a security verification. Can
you send me the CRCs?" You _have_ to present yourself as one of the
victims to be infected by the bad object, or a smart attacker will send
you the unmodified data.

> Getting back to the birthday attack question (this is an area where
> your comments were very useful for me), there's a case I didn't
> consider.
> [elaborate birthday attack scenario]

From my quick reading of your scenario, yes, that is a possible attack.
To me, though, it just highlights the need for either a non-colliding
algorithm, or for better trust verification about the authors of objects
(i.e., cryptographically strong trust).

-Peff

^ permalink raw reply

* Re: git auto-repack is broken...
From: Jeff King @ 2011-12-02 17:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ævar Arnfjörð Bjarmason, Junio C Hamano,
	Git Mailing List
In-Reply-To: <CA+55aFyq28vmo9dk-5mVm+nNn86qSjNT6VJGc09iaJo=+OP1Sg@mail.gmail.com>

On Fri, Dec 02, 2011 at 08:56:34AM -0800, Linus Torvalds wrote:

> On Fri, Dec 2, 2011 at 8:27 AM, Ævar Arnfjörð Bjarmason
> <avarab@gmail.com> wrote:
> >
> >> Maybe we should make "git gc --auto" remove any unreachable objects?
> >
> > Wouldn't that mean that any loose commit objects you have lying around
> > would be removed by the automatic git gc?
> >
> > One feature of git that I personally rely on is that I can liberally
> > move heads around / make commits on detached heads and not have those
> > commits gc'd unless I explicitly ask for it for a while.
> 
> Well, with reflogs, you actually do have those objects reachable for
> quite a while (90 days by default).
> 
> The "unreachable objects" tends to happen when you do fetches without
> ever merging the result or actually remove branches (and/or expiring
> the reflogs early etc). Not from the normal "use 'git reset' and
> friends to move heads around".
> 
> That said, I do agree that removing loose objects is the much less
> safe approach.

We do remove loose objects that are totally unreferenced, but there is
still a time-delay, because we don't want to prune something like an
in-progress commit operation. The default delay for that is 2 weeks,
which I think is an arbitrary number that was "wow, if your git
operation takes longer than this, you're way too patient".

And in general, it works OK because people don't tend to accumulate more
than the auto-gc number of objects within a 2 week period. So perhaps
you're just special in your usage patterns.

One solution is just dropping that "2 weeks" down to something smaller,
but still conservative (say, 3 days?).

If you still have the repo in question, what is the date breakdown on
your loose objects?

> Of course, repacking the objects results in problems too: now you've
> entirely lost the age information for that object, so now you cannot
> prune it based on age any more.

When the objects become unreferenced, we eject them from the pack into
loose form again. If they don't become referenced in the 2-week window,
they get pruned then. So yes, you drop the age information, but they do
eventually go away.

-Peff

^ permalink raw reply

* Re: [PATCH v2 0/3] grep multithreading and scaling
From: Jeff King @ 2011-12-02 17:34 UTC (permalink / raw)
  To: Thomas Rast; +Cc: René Scharfe, Eric Herman, git, Junio C Hamano
In-Reply-To: <cover.1322830368.git.trast@student.ethz.ch>

On Fri, Dec 02, 2011 at 02:07:45PM +0100, Thomas Rast wrote:

> where I put the --cached originally because that makes it independent
> of the worktree (which in the very first measurements I still had
> wiped, as I tend to do for this repo; I checked it out again after
> that).  This in fact gives me (~/g/git-grep --cached
> INITRAMFS_ROOT_UID, leaving aside -W; best of 10):
> 
>   THREADS=8:   2.88user 0.21system 0:02.94elapsed
>   THREADS=4:   2.89user 0.29system 0:02.99elapsed
>   THREADS=2:   2.83user 0.36system 0:02.87elapsed
>   NO_PTHREADS: 2.16user 0.08system 0:02.25elapsed
> 
> Uhuh.  Doesn't scale so well after all.  But removing the --cached, as
> most people probably would:
> 
>   THREADS=8:   0.19user 0.32system 0:00.16elapsed
>   THREADS=4:   0.16user 0.34system 0:00.17elapsed
>   THREADS=2:   0.18user 0.32system 0:00.26elapsed
>   NO_PTHREADS: 0.12user 0.17system 0:00.31elapsed
> 
> So I conclude that during any grep that cannot use the worktree,
> having any threads hurts.

Wow, that's horrible. Leaving aside the parallelism, it's just terrible
that reading from the cache is 20 times slower than the worktree. I get
similar results on my quad-core machine.

A quick perf run shows most of the time is spent inflating objects. The
diff code has a sneaky trick to re-use worktree files when we know they
are stat-clean (in diff's case it is to avoid writing a tempfile). I
wonder if we should use the same trick here.

It would hurt the cold cache case, though, as the compressed versions
require fewer disk accesses, of course.

-Peff

PS I suspect your timings are somewhat affected by the simplicity of the
   regex you are asking for. The time to inflate the blobs dominates,
   because the search is just a memmem(). On my quad-core w/
   hyperthreading (i.e., 8 apparent cores):

   [no caching, simple regex; we get some parallelism, but the regex
    task is just not that intensive]
   $ /usr/bin/time git grep INITRAMFS_ROOT_UID >/dev/null
   0.42user 0.45system 0:00.15elapsed 578%CPU

   [no caching, harder regex; we get much higher CPU utilization]
   $ /usr/bin/time git grep 'a.*b' >/dev/null
   14.68user 0.50system 0:02.00elapsed 758%CPU

   [with caching, simple regex; we get almost _no_ parallelism because
    all of our time is spent inflating under a lock, and the regex task
    takes very little time]
   $ /usr/bin/time git grep --cached INITRAMFS_ROOT_UID >/dev/null
   7.64user 0.41system 0:07.61elapsed 105%CPU

   [with caching, harder regex; not as much parallelism as we hoped for,
    but still much more than before. Because there is actually work to
    parallelize in the regex]
   $ /usr/bin/time git grep --cached 'a.*b' >/dev/null
   23.46user 0.47system 0:08.42elapsed 284%CPU

   So I think there is value in parallelizing even --cached greps. But
   we could do so much better if blob inflation could be done in
   parallel.

^ permalink raw reply

* Re: git auto-repack is broken...
From: Junio C Hamano @ 2011-12-02 17:35 UTC (permalink / raw)
  To: Jeff King
  Cc: Linus Torvalds, Ævar Arnfjörð Bjarmason,
	Junio C Hamano, Git Mailing List
In-Reply-To: <20111202171017.GB23447@sigill.intra.peff.net>

Jeff King <peff@peff.net> writes:

> When the objects become unreferenced, we eject them from the pack into
> loose form again. If they don't become referenced in the 2-week window,
> they get pruned then. So yes, you drop the age information, but they do
> eventually go away.

If you update gc/repack -A to put them in a separate pack, then you would
never be able to get rid of them, no? You pack, then eject (which gives
them a fresher timestamp), then notice that you are within the 2-week window
and pack them again,...

^ permalink raw reply

* Re: git auto-repack is broken...
From: Jeff King @ 2011-12-02 17:45 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Linus Torvalds, Ævar Arnfjörð Bjarmason,
	Git Mailing List
In-Reply-To: <7vobvqoozr.fsf@alter.siamese.dyndns.org>

On Fri, Dec 02, 2011 at 09:35:52AM -0800, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > When the objects become unreferenced, we eject them from the pack into
> > loose form again. If they don't become referenced in the 2-week window,
> > they get pruned then. So yes, you drop the age information, but they do
> > eventually go away.
> 
> If you update gc/repack -A to put them in a separate pack, then you would
> never be able to get rid of them, no? You pack, then eject (which gives
> them a fresher timestamp), then notice that you are within the 2-week window
> and pack them again,...

But we shouldn't be packing totally unreferenced objects. Barring bugs,
the life cycle of such an object should be something like:

  1. Object X is created on branch 'foo'.

  2. Branch 'foo' is deleted, but its commits are still in the HEAD
     reflog, referencing X.

  3. 90 days pass (actually, I think this might be the 30-day
     expire-unreachable time)

  4. "git gc" runs "git repack -Ad", which will eject X from the pack
     into a loose form (because it is not becoming part of the new pack
     we are writing).

  5. Two weeks pass.

  6. "git gc" runs "git prune --expire=2.weeks.ago", which removes the
     object.

"gc" runs between (4) and (6) will not re-pack the object, because it
remains unreferenced.

I think things might be slowed somewhat by "gc --auto", which will not
do a "repack -A" until we have too many packs. So steps (3) and (4) are
really more like "gc runs git-repack without -A" 50 times, and then we
finally run "git repack -A".

-Peff

^ permalink raw reply

* Re: Suggestion on hashing
From: Jeff King @ 2011-12-02 17:54 UTC (permalink / raw)
  To: Bill Zaumen; +Cc: git, pclouds
In-Reply-To: <1322813319.4340.109.camel@yos>

On Fri, Dec 02, 2011 at 12:08:39AM -0800, Bill Zaumen wrote:

> At one point Nguyen said that "What I'm thinking is whether it's
> possible to decouple two sha-1 roles in git, as object identifier
> and digest, separately. Each sha-1 identifies an object and an extra
> set of digests on the "same" object."
> 
> My code pretty much does that (it just uses a CRC instead of a real
> digest, but I can easily change that).   So the question is whether
> using SHA-1 as an ID and SHA-256(?) as a digest is a better long term
> solution than simply replacing SHA-1.

I think your code is solving the wrong problem (or solving the right
problem in a half-way manner). The only things that make sense to me
are:

  1. Do nothing. SHA-1 is probably not broken yet, even by the NSA, and
     even if it is, an attack is extremely expensive to mount. This may
     change in the future, of course, but it will probably stay
     expensive for a while.

  2. Decouple the object identifier and digest roles, but insert the
     digest into newly created objects, so it can be part of the
     signature chain. I described such a scheme in one of my replies to
     you. It has some complexities, but has the bonus that we can build
     directly on older history, preserving its sha1s.

  3. Replace SHA-1 with a more secure algorithm.

I'm probably in favor of (1) at this point. Whether to do (2) or (3)
will depend on where we are when SHA-1 gets feasibly broken. It may be
many years away, at which point we may be considering a git 2.0 that
breaks repository compatibility, anyway. That would be a natural time to
consider changing the algorithm.

> Replacing SHA-1 with something like SHA-256 sounds easier to implement,
> but the problem is all the existing repositories.

Right. I don't think anyone is denying that it would be a giant pain.

-Peff

^ permalink raw reply

* Re: git auto-repack is broken...
From: Junio C Hamano @ 2011-12-02 18:08 UTC (permalink / raw)
  To: Jeff King
  Cc: Linus Torvalds, Ævar Arnfjörð Bjarmason,
	Git Mailing List
In-Reply-To: <20111202174546.GA24093@sigill.intra.peff.net>

Jeff King <peff@peff.net> writes:

> But we shouldn't be packing totally unreferenced objects.

Everything you said is correct in today's Git and I obviously know it, but
I was taking the "Or have an extra option to..." at the end of the OP's
message in the thread into account, so...

^ permalink raw reply

* Re: Suggestion on hashing
From: Jeff King @ 2011-12-02 18:09 UTC (permalink / raw)
  To: Nguyen Thai Ngoc Duy; +Cc: Bill Zaumen, Git Mailing List
In-Reply-To: <CACsJy8CO1GtpZVo-oA2eKbQadsXYBEKVLfUH0GONR5jovuvH+Q@mail.gmail.com>

On Fri, Dec 02, 2011 at 09:22:31PM +0700, Nguyen Thai Ngoc Duy wrote:

> > So the question is whether
> > using SHA-1 as an ID and SHA-256(?) as a digest is a better long term
> > solution than simply replacing SHA-1.
> 
> I would not stick with any algorithm permanently. No one knows when
> SHA-256 might be broken.

Yeah, you could stick a few bits of algorithm parameter in the beginning
of each identifier. It would mean unique hashes get one character or so
longer (and they would all start with "1", or whatever the identifier
is).

SHA-256 doesn't suffer from SHA-1's problems, though they are based on
related constructions, so I think there is some concern that it may
eventually fail in the same way. SHA-3 is a better bet in that sense,
but it will also be very unproven, even once it is actually
standardized.

> > Replacing SHA-1 with something like SHA-256 sounds easier to implement,
> 
> SHA-1 characteristics (like 20 byte length) are hard coded everywhere
> in git, it'd be a big audit.

In theory, you could truncate a longer hash to 160-bits. It's not the
bit-strength of SHA-1 that is the problem, but the attacks on the
algorithm itself which reduce the bit-strength to something too low.
I would think a truncated result would retain the same cryptographic
properties, as one of the properties of the un-truncated hash is that
changes in the input data are reflected throughout the hash. Some
hashes, like Skein, explicitly have a big internal state, and then just
let you output as many bytes as is appropriate (i.e., being a drop-in
replacement for SHA-1 is an explicit goal).

But I'm not a cryptographer, so there may be some subtle issues with
doing that to arbitrary hash functions.

-Peff

^ permalink raw reply

* Re: git auto-repack is broken...
From: Jeff King @ 2011-12-02 18:13 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Linus Torvalds, Ævar Arnfjörð Bjarmason,
	Git Mailing List
In-Reply-To: <7vd3c6onhs.fsf@alter.siamese.dyndns.org>

On Fri, Dec 02, 2011 at 10:08:15AM -0800, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > But we shouldn't be packing totally unreferenced objects.
> 
> Everything you said is correct in today's Git and I obviously know it, but
> I was taking the "Or have an extra option to..." at the end of the OP's
> message in the thread into account, so...

Ah, sorry, I missed the subtlety of Linus's "repacking the objects
results in problems..." from his later message and thought he just meant
repacking in general. Yes, it's a bad idea to repack unreachable objects
because then you could never prune anything.

I think just shrinking the --expire window that we already use is a much
more reasonable bet. It's not about preventing the loss of old work
(reflogs are there for that), but about avoiding hurting an actively
running, about-to-reference-the-objects git process. And 2 weeks is
quite conservative for that.

-Peff

^ permalink raw reply

* Re: [PATCH v2 0/3] grep multithreading and scaling
From: Eric Herman @ 2011-12-02 20:02 UTC (permalink / raw)
  To: Thomas Rast; +Cc: René Scharfe, git, Junio C Hamano
In-Reply-To: <cover.1322830368.git.trast@student.ethz.ch>

Hello Thomas,

Thanks for the work and the great info.
Some of the numbers are quite surprising.

I do, indeed, have a machine with more cores, but I have been either 
busy with out-of-town guests or generally plain lazy in the last couple 
of weeks. I intend to set aside some time to do some benchmarking this 
weekend.

I'll let you know what I find.

Cheers,
  -Eric

-- 
http://www.freesa.org/ -- mobile: +31 620719662
aim: ericigps -- skype: eric_herman -- jabber: eric.herman@gmail.com

^ permalink raw reply

* [ANNOUNCE] Git 1.7.8
From: Junio C Hamano @ 2011-12-02 20:25 UTC (permalink / raw)
  To: git; +Cc: Linux Kernel

The latest feature release Git 1.7.8 is available.

The release tarballs are found at:

    http://code.google.com/p/git-core/downloads/list

and their SHA-1 checksums are:

7453e737e008f7319a5eca24a9ef3c5fb1f13398  git-1.7.8.tar.gz
2734079e22a0a6e3e78779582be9138ffc7de6f7  git-htmldocs-1.7.8.tar.gz
93315f7f51d7f27d3e421c9b0d64afa27f3d16df  git-manpages-1.7.8.tar.gz

Also the following public repositories all have a copy of the v1.7.8
tag and the master branch that the tag points at:

  url = git://repo.or.cz/alt-git.git
  url = https://code.google.com/p/git-core/
  url = git://git.sourceforge.jp/gitroot/git-core/git.git
  url = git://git-core.git.sourceforge.net/gitroot/git-core/git-core
  url = https://github.com/gitster/git

Git v1.7.8 Release Notes
========================

Updates since v1.7.7
--------------------

 * Some git-svn, git-gui, git-p4 (in contrib) and msysgit updates.

 * Updates to bash completion scripts.

 * The build procedure has been taught to take advantage of computed
   dependency automatically when the compiler supports it.

 * The date parser now accepts timezone designators that lack minutes
   part and also has a colon between "hh:mm".

 * The contents of the /etc/mailname file, if exists, is used as the
   default value of the hostname part of the committer/author e-mail.

 * "git am" learned how to read from patches generated by Hg.

 * "git archive" talking with a remote repository can report errors
   from the remote side in a more informative way.

 * "git branch" learned an explicit --list option to ask for branches
   listed, optionally with a glob matching pattern to limit its output.

 * "git check-attr" learned "--cached" option to look at .gitattributes
   files from the index, not from the working tree.

 * Variants of "git cherry-pick" and "git revert" that take multiple
   commits learned to "--continue" and "--abort".

 * "git daemon" gives more human readable error messages to clients
   using ERR packets when appropriate.

 * Errors at the network layer are logged by "git daemon".

 * "git diff" learned "--minimal" option to spend extra cycles to come
   up with a minimal patch output.

 * "git diff" learned "--function-context" option to show the whole
   function as context that was affected by a change.

 * "git difftool" can be told to skip launching the tool for a path by
   answering 'n' to its prompt.

 * "git fetch" learned to honor transfer.fsckobjects configuration to
   validate the objects that were received from the other end, just like
   "git receive-pack" (the receiving end of "git push") does.

 * "git fetch" makes sure that the set of objects it received from the
   other end actually completes the history before updating the refs.
   "git receive-pack" (the receiving end of "git push") learned to do the
   same.

 * "git fetch" learned that fetching/cloning from a regular file on the
   filesystem is not necessarily a request to unpack a bundle file; the
   file could be ".git" with "gitdir: <path>" in it.

 * "git for-each-ref" learned "%(contents:subject)", "%(contents:body)"
   and "%(contents:signature)". The last one is useful for signed tags.

 * "git grep" used to incorrectly pay attention to .gitignore files
   scattered in the directory it was working in even when "--no-index"
   option was used. It no longer does this. The "--exclude-standard"
   option needs to be given to explicitly activate the ignore
   mechanism.

 * "git grep" learned "--untracked" option, where given patterns are
    searched in untracked (but not ignored) files as well as tracked
    files in the working tree, so that matches in new but not yet
    added files do not get missed.

 * The recursive merge backend no longer looks for meaningless
   existing merges in submodules unless in the outermost merge.

 * "git log" and friends learned "--children" option.

 * "git ls-remote" learned to respond to "-h"(elp) requests.

 * "mediawiki" remote helper can interact with (surprise!) MediaWiki
   with "git fetch" & "git push".

 * "git merge" learned the "--edit" option to allow users to edit the
   merge commit log message.

 * "git rebase -i" can be told to use special purpose editor suitable
   only for its insn sheet via sequence.editor configuration variable.

 * "git send-email" learned to respond to "-h"(elp) requests.

 * "git send-email" allows the value given to sendemail.aliasfile to begin
   with "~/" to refer to the $HOME directory.

 * "git send-email" forces use of Authen::SASL::Perl to work around
   issues between Authen::SASL::Cyrus and AUTH PLAIN/LOGIN.

 * "git stash" learned "--include-untracked" option to stash away
   untracked/ignored cruft from the working tree.

 * "git submodule clone" does not leak an error message to the UI
   level unnecessarily anymore.

 * "git submodule update" learned to honor "none" as the value for
   submodule.<name>.update to specify that the named submodule should
   not be checked out by default.

 * When populating a new submodule directory with "git submodule init",
   the $GIT_DIR metainformation directory for submodules is created inside
   $GIT_DIR/modules/<name>/ directory of the superproject and referenced
   via the gitfile mechanism. This is to make it possible to switch
   between commits in the superproject that has and does not have the
   submodule in the tree without re-cloning.

 * "gitweb" leaked unescaped control characters from syntax hiliter
   outputs.

 * "gitweb" can be told to give custom string at the end of the HTML
   HEAD element.

 * "gitweb" now has its own manual pages.

Also contains other documentation updates and minor code cleanups.

Fixes since v1.7.7
------------------

Unless otherwise noted, all fixes in the 1.7.7.X maintenance track are
included in this release.

 * HTTP transport did not use pushurl correctly, and also did not tell
   what host it is trying to authenticate with when asking for
   credentials.
   (merge deba493 jk/http-auth later to maint).

 * "git blame" was aborted if started from an uncommitted content and
   the path had the textconv filter in effect.
   (merge 8518088 ss/blame-textconv-fake-working-tree later to maint).

 * Adding many refs to the local repository in one go (e.g. "git fetch"
   that fetches many tags) and looking up a ref by name in a repository
   with too many refs were unnecessarily slow.
   (merge 17d68a54d jp/get-ref-dir-unsorted later to maint).

 * Report from "git commit" on untracked files was confused under
   core.ignorecase option.
   (merge 395c7356 jk/name-hash-dirent later to maint).

 * "git merge" did not understand ":/<pattern>" as a way to name a commit.

 * "git push" on the receiving end used to call post-receive and post-update
   hooks for attempted removal of non-existing refs.
   (merge 160b81ed ph/push-to-delete-nothing later to maint).

 * Help text for "git remote set-url" and "git remote set-branches"
   were misspelled.
   (merge c49904e fc/remote-seturl-usage-fix later to maint).
   (merge 656cdf0 jc/remote-setbranches-usage-fix later to maint).

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox