git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Derrick Stolee <stolee@gmail.com>
To: Lidong Yan <yldhome2d2@gmail.com>
Cc: 502024330056@smail.nju.edu.cn, git@vger.kernel.org,
	gitster@pobox.com, toon@iotcl.com
Subject: [PATCH v5.1 4/4] bloom: optimize multiple pathspec items in revision
Date: Thu, 10 Jul 2025 09:55:15 -0400	[thread overview]
Message-ID: <2619038e-05f5-4af8-bb20-e4e01138f839@gmail.com> (raw)
In-Reply-To: <20250710084829.2171855-5-502024330056@smail.nju.edu.cn>

On 7/10/2025 4:48 AM, Lidong Yan wrote:
> @@ -710,23 +709,26 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
>  	if (!revs->pruning.pathspec.nr)
>  		return;
>  
> -	revs->bloom_keyvecs_nr = 1;
> -	CALLOC_ARRAY(revs->bloom_keyvecs, 1);
> -	pi = &revs->pruning.pathspec.items[0];
> +	revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr;
> +	CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr);
> +	for (int i = 0; i < revs->pruning.pathspec.nr; i++) {
> +		pi = &revs->pruning.pathspec.items[i];
>  
> -	/* remove single trailing slash from path, if needed */
> -	if (pi->len > 0 && pi->match[pi->len - 1] == '/') {
> -		path_alloc = xmemdupz(pi->match, pi->len - 1);
> -		path = path_alloc;
> -	} else
> -		path = pi->match;
> +		/* remove single trailing slash from path, if needed */
> +		if (pi->len > 0 && pi->match[pi->len - 1] == '/') {
> +			path_alloc = xmemdupz(pi->match, pi->len - 1);
> +			path = path_alloc;
> +		} else
> +			path = pi->match;
>  
> -	len = strlen(path);
> -	if (!len)
> -		goto fail;
> +		len = strlen(path);
> +		if (!len)
> +			goto fail;
>  
> -	revs->bloom_keyvecs[0] =
> -		bloom_keyvec_new(path, len, revs->bloom_filter_settings);
> +		revs->bloom_keyvecs[i] =
> +			bloom_keyvec_new(path, len, revs->bloom_filter_settings);
> +		FREE_AND_NULL(path_alloc);
> +	}

Focus on the change to this diff when the patch below is applied on
top of the 3.5/4 I sent earlier, resulting in this diff:

@@ -733,13 +732,14 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 	if (!revs->pruning.pathspec.nr)
 		return;
 
-	revs->bloom_keyvecs_nr = 1;
-	CALLOC_ARRAY(revs->bloom_keyvecs, 1);
-
-	if (convert_pathspec_to_filter(&revs->pruning.pathspec.items[0],
-				       &revs->bloom_keyvecs[0],
-				       revs->bloom_filter_settings))
-		goto fail;
+	revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr;
+	CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr);
+	for (int i = 0; i < revs->pruning.pathspec.nr; i++) {
+		if (convert_pathspec_to_filter(&revs->pruning.pathspec.items[i],
+					       &revs->bloom_keyvecs[i],
+					       revs->bloom_filter_settings))
+			goto fail;
+	}
 
 	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
 		atexit(trace2_bloom_filter_statistics_atexit);

Also, I've included the hyperfine performance output in the commit
message.

Thanks,
-Stolee


--- >8 ---

From fe255b1acfbe90fa8e4c335435ae18ee95e6243c Mon Sep 17 00:00:00 2001
From: Lidong Yan <502024330056@smail.nju.edu.cn>
Date: Thu, 10 Jul 2025 08:04:34 -0400
Subject: [PATCH v5.1 4/4] bloom: optimize multiple pathspec items in revision
 traversal

To enable optimize multiple pathspec items in revision traversal,
return 0 if all pathspec item is literal in forbid_bloom_filters().
Add for loops to initialize and check each pathspec item's bloom_keyvec
when optimization is possible.

Add new test cases in t/t4216-log-bloom.sh to ensure
  - consistent results between the optimization for multiple pathspec
    items using bloom filter and the case without bloom filter
    optimization.
  - does not use bloom filter if any pathspec item is not literal.

With these optimizations, we get some improvements for multi-pathspec runs
of 'git log'. First, in the Git repository we see these modest results:

Benchmark 1: old
  Time (mean ± σ):      73.1 ms ±   2.9 ms
  Range (min … max):    69.9 ms …  84.5 ms    42 runs

Benchmark 2: new
  Time (mean ± σ):      55.1 ms ±   2.9 ms
  Range (min … max):    51.1 ms …  61.2 ms    52 runs

Summary
  'new' ran
    1.33 ± 0.09 times faster than 'old'

But in a larger repo, such as the LLVM project repo below, we get even
better results:

Benchmark 1: old
  Time (mean ± σ):      1.974 s ±  0.006 s
  Range (min … max):    1.960 s …  1.983 s    10 runs

Benchmark 2: new
  Time (mean ± σ):     262.9 ms ±   2.4 ms
  Range (min … max):   257.7 ms … 266.2 ms    11 runs

Summary
  'new' ran
    7.51 ± 0.07 times faster than 'old'

Signed-off-by: Lidong Yan <502024330056@smail.nju.edu.cn>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 revision.c           | 22 +++++++++++-----------
 t/t4216-log-bloom.sh | 23 ++++++++++++++---------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/revision.c b/revision.c
index 4c09b594c55..ca8c1dde8ca 100644
--- a/revision.c
+++ b/revision.c
@@ -675,12 +675,11 @@ static int forbid_bloom_filters(struct pathspec *spec)
 {
 	if (spec->has_wildcard)
 		return 1;
-	if (spec->nr > 1)
-		return 1;
 	if (spec->magic & ~PATHSPEC_LITERAL)
 		return 1;
-	if (spec->nr && (spec->items[0].magic & ~PATHSPEC_LITERAL))
-		return 1;
+	for (size_t nr = 0; nr < spec->nr; nr++)
+		if (spec->items[nr].magic & ~PATHSPEC_LITERAL)
+			return 1;
 
 	return 0;
 }
@@ -733,13 +732,14 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 	if (!revs->pruning.pathspec.nr)
 		return;
 
-	revs->bloom_keyvecs_nr = 1;
-	CALLOC_ARRAY(revs->bloom_keyvecs, 1);
-
-	if (convert_pathspec_to_filter(&revs->pruning.pathspec.items[0],
-				       &revs->bloom_keyvecs[0],
-				       revs->bloom_filter_settings))
-		goto fail;
+	revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr;
+	CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr);
+	for (int i = 0; i < revs->pruning.pathspec.nr; i++) {
+		if (convert_pathspec_to_filter(&revs->pruning.pathspec.items[i],
+					       &revs->bloom_keyvecs[i],
+					       revs->bloom_filter_settings))
+			goto fail;
+	}
 
 	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
 		atexit(trace2_bloom_filter_statistics_atexit);
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 8910d53cac1..639868ac562 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -66,8 +66,9 @@ sane_unset GIT_TRACE2_CONFIG_PARAMS
 
 setup () {
 	rm -f "$TRASH_DIRECTORY/trace.perf" &&
-	git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom &&
-	GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
+	eval git -c core.commitGraph=false log --pretty="format:%s" "$1" >log_wo_bloom &&
+	eval "GIT_TRACE2_PERF=\"$TRASH_DIRECTORY/trace.perf\"" \
+		git -c core.commitGraph=true log --pretty="format:%s" "$1" >log_w_bloom
 }
 
 test_bloom_filters_used () {
@@ -138,10 +139,6 @@ test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
 	test_bloom_filters_not_used "--walk-reflogs -- A"
 '
 
-test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
-	test_bloom_filters_not_used "-- file4 A/file1"
-'
-
 test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' '
 	test_bloom_filters_not_used "-- ."
 '
@@ -151,9 +148,17 @@ test_expect_success 'git log with wildcard that resolves to a single path uses B
 	test_bloom_filters_used "-- *renamed"
 '
 
-test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' '
-	test_bloom_filters_not_used "-- *" &&
-	test_bloom_filters_not_used "-- file*"
+test_expect_success 'git log with multiple literal paths uses Bloom filter' '
+	test_bloom_filters_used "-- file4 A/file1" &&
+	test_bloom_filters_used "-- *" &&
+	test_bloom_filters_used "-- file*"
+'
+
+test_expect_success 'git log with path contains a wildcard does not use Bloom filter' '
+	test_bloom_filters_not_used "-- file\*" &&
+	test_bloom_filters_not_used "-- A/\* file4" &&
+	test_bloom_filters_not_used "-- file4 A/\*" &&
+	test_bloom_filters_not_used "-- * A/\*"
 '
 
 test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '
-- 
2.47.2.vfs.0.2



  parent reply	other threads:[~2025-07-10 13:55 UTC|newest]

Thread overview: 72+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-25 12:55 [PATCH 0/2] bloom: use bloom filter given multiple pathspec Lidong Yan
2025-06-25 12:55 ` [PATCH 1/2] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-06-25 17:43   ` Junio C Hamano
2025-06-26  3:44     ` Lidong Yan
2025-06-25 12:55 ` [PATCH 2/2] bloom: enable multiple pathspec bloom keys Lidong Yan
2025-06-27 13:50   ` Junio C Hamano
2025-06-27 14:24     ` Lidong Yan
2025-06-27 18:09       ` Junio C Hamano
2025-07-01  5:52     ` Lidong Yan
2025-07-01 15:19       ` Junio C Hamano
2025-07-02  7:14         ` Lidong Yan
2025-07-02 15:48           ` Junio C Hamano
2025-07-03  1:52             ` Lidong Yan
2025-07-04 12:09             ` Lidong Yan
2025-07-01  8:50     ` SZEDER Gábor
2025-07-01 11:40       ` Lidong Yan
2025-07-01 15:43       ` Junio C Hamano
2025-06-27 20:39   ` Junio C Hamano
2025-06-28  2:54     ` Lidong Yan
2025-06-25 17:32 ` [PATCH 0/2] bloom: use bloom filter given multiple pathspec Junio C Hamano
2025-06-26  3:34   ` Lidong Yan
2025-06-26 14:15     ` Junio C Hamano
2025-06-27  6:21 ` [PATCH v2 0/2] bloom: enable bloom filter optimization for multiple pathspec elements in revision traversal Lidong Yan
2025-06-28  4:21   ` [PATCH v3 " Lidong Yan
2025-07-04 11:14     ` [PATCH v4 0/4] " Lidong Yan
2025-07-04 11:14       ` [PATCH v4 1/4] bloom: add test helper to return murmur3 hash Lidong Yan
2025-07-04 11:14       ` [PATCH v4 2/4] bloom: rename function operates on bloom_key Lidong Yan
2025-07-04 11:14       ` [PATCH v4 3/4] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-07-07 11:35         ` Derrick Stolee
2025-07-07 14:14           ` Lidong Yan
2025-07-04 11:14       ` [PATCH v4 4/4] bloom: optimize multiple pathspec items in revision traversal Lidong Yan
2025-07-07 11:43         ` Derrick Stolee
2025-07-07 14:18           ` Lidong Yan
2025-07-07 15:14           ` Junio C Hamano
2025-07-10  8:48       ` [PATCH v5 0/4] bloom: enable bloom filter optimization for multiple pathspec elements " Lidong Yan
2025-07-10  8:48         ` [PATCH v5 1/4] bloom: add test helper to return murmur3 hash Lidong Yan
2025-07-10  8:48         ` [PATCH v5 2/4] bloom: rename function operates on bloom_key Lidong Yan
2025-07-10  8:48         ` [PATCH v5 3/4] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-07-10 16:17           ` Junio C Hamano
2025-07-11 12:46             ` Lidong Yan
2025-07-11 15:06               ` Junio C Hamano
2025-07-10  8:48         ` [PATCH v5 4/4] bloom: optimize multiple pathspec items in revision traversal Lidong Yan
2025-07-10 13:51           ` [PATCH v5.1 3.5/4] revision: make helper for pathspec to bloom key Derrick Stolee
2025-07-10 15:42             ` Lidong Yan
2025-07-10 13:55           ` Derrick Stolee [this message]
2025-07-10 15:49             ` [PATCH v5.1 4/4] bloom: optimize multiple pathspec items in revision Lidong Yan
2025-07-10 13:49         ` [PATCH v5 0/4] bloom: enable bloom filter optimization for multiple pathspec elements in revision traversal Derrick Stolee
2025-07-12  9:35         ` [PATCH v6 0/5] " Lidong Yan
2025-07-12  9:35           ` [PATCH v6 1/5] bloom: add test helper to return murmur3 hash Lidong Yan
2025-07-12  9:35           ` [PATCH v6 2/5] bloom: rename function operates on bloom_key Lidong Yan
2025-07-12  9:35           ` [PATCH v6 3/5] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-07-12  9:35           ` [PATCH v6 4/5] revision: make helper for pathspec to bloom keyvec Lidong Yan
2025-07-12  9:35           ` [PATCH v6 5/5] To enable optimize multiple pathspec items in revision traversal, return 0 if all pathspec item is literal in forbid_bloom_filters(). Add for loops to initialize and check each pathspec item's bloom_keyvec when optimization is possible Lidong Yan
2025-07-12  9:47             ` Lidong Yan
2025-07-12  9:51               ` [PATCH v6 5/5] bloom: optimize multiple pathspec items in revision Lidong Yan
2025-07-14 16:51                 ` Derrick Stolee
2025-07-14 17:01                   ` Junio C Hamano
2025-07-15  1:37                     ` Lidong Yan
2025-07-15  2:56                       ` [RESEND][PATCH " Lidong Yan
2025-07-14 16:53           ` [PATCH v6 0/5] bloom: enable bloom filter optimization for multiple pathspec elements in revision traversal Derrick Stolee
2025-07-14 17:02             ` Junio C Hamano
2025-07-15  1:34             ` Lidong Yan
2025-07-15  2:48               ` Derrick Stolee
2025-07-15 15:09                 ` Junio C Hamano
2025-06-28  4:21   ` [PATCH v3 1/2] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-07-02 15:08     ` Patrick Steinhardt
2025-07-02 15:49       ` Lidong Yan
2025-07-02 18:28       ` Junio C Hamano
2025-07-03  1:41         ` Lidong Yan
2025-06-28  4:21   ` [PATCH v3 2/2] bloom: optimize multiple pathspec items in revision traversal Lidong Yan
2025-06-27  6:21 ` [PATCH v2 1/2] bloom: replace struct bloom_key * with struct bloom_keyvec Lidong Yan
2025-06-27  6:21 ` [PATCH v2 2/2] bloom: optimize multiple pathspec items in revision traversal Lidong Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2619038e-05f5-4af8-bb20-e4e01138f839@gmail.com \
    --to=stolee@gmail.com \
    --cc=502024330056@smail.nju.edu.cn \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=toon@iotcl.com \
    --cc=yldhome2d2@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).