* [PATCH] xdiff: disable cleanup_records heuristic with --minimal
@ 2025-04-25 15:59 Niels Glodny
2025-04-27 15:04 ` Phillip Wood
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Niels Glodny @ 2025-04-25 15:59 UTC (permalink / raw)
To: git; +Cc: johannes.schindelin, peff, phillip.wood, Niels Glodny
The cleanup_records function marks some lines as changed
before running the actual diff algorithm. For most lines,
this is a good performance optimization, but it also marks
lines that are surrounded by many changed lines as changed
as well. This can cause redundant changes and longer-than-
necessary diffs.
Whether this results in better-looking diffs is subjective.
However, the --minimal flag explicitly requests the shortest
possible diff.
The performance impact of this change is negligible, and it
results in shorter diffs in about 1.3% of diffs in Git's
history.
Signed-off-by: Niels Glodny <n.glodny@campus.lmu.de>
---
t/meson.build | 1 +
t/t4071-diff-minimal.sh | 16 ++++++++++++++++
xdiff/xprepare.c | 22 +++++++++++++---------
3 files changed, 30 insertions(+), 9 deletions(-)
create mode 100755 t/t4071-diff-minimal.sh
diff --git a/t/meson.build b/t/meson.build
index bfb744e886..8f2e9d2c50 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -501,6 +501,7 @@ integration_tests = [
't4068-diff-symmetric-merge-base.sh',
't4069-remerge-diff.sh',
't4070-diff-pairs.sh',
+ 't4071-diff-minimal.sh',
't4100-apply-stat.sh',
't4101-apply-nonl.sh',
't4102-apply-rename.sh',
diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
new file mode 100755
index 0000000000..3ad759dab4
--- /dev/null
+++ b/t/t4071-diff-minimal.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+test_description='minimal diff algorithm'
+
+. ./test-lib.sh
+
+test_expect_success 'minimal diff should not mark changes between changed lines' '
+ printf "x\nx\nx\nx\n" >pre &&
+ printf "x\nx\nx\nA\nB\nC\nD\nx\nE\nF\nG\n" >post &&
+ test_must_fail git diff --no-index \
+ --minimal pre post >diff &&
+ ! grep "+x" diff &&
+ ! grep "-x" diff
+'
+
+test_done
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index c84549f6c5..cb0b6c9fd6 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -61,9 +61,11 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
xdlclassifier_t *cf, xdfile_t *xdf);
static void xdl_free_ctx(xdfile_t *xdf);
static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
-static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
+static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1,
+ xdfile_t *xdf2, int need_min);
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
-static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
+static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1,
+ xdfile_t *xdf2, int need_min);
@@ -279,7 +281,8 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
- xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
+ xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2,
+ (xpp->flags & XDF_NEED_MINIMAL) != 0) < 0) {
xdl_free_ctx(&xe->xdf2);
xdl_free_ctx(&xe->xdf1);
@@ -363,7 +366,8 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
* matches on the other file. Also, lines that have multiple matches
* might be potentially discarded if they happear in a run of discardable.
*/
-static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
+static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1,
+ xdfile_t *xdf2, int need_min) {
long i, nm, nreff, mlim;
xrecord_t **recs;
xdlclass_t *rcrec;
@@ -379,7 +383,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len2 : 0;
- dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
@@ -387,7 +391,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len1 : 0;
- dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
@@ -449,10 +453,10 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
}
-static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
-
+static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1,
+ xdfile_t *xdf2, int need_min) {
if (xdl_trim_ends(xdf1, xdf2) < 0 ||
- xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
+ xdl_cleanup_records(cf, xdf1, xdf2, need_min) < 0) {
return -1;
}
base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
--
2.34.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] xdiff: disable cleanup_records heuristic with --minimal
2025-04-25 15:59 [PATCH] xdiff: disable cleanup_records heuristic with --minimal Niels Glodny
@ 2025-04-27 15:04 ` Phillip Wood
2025-04-27 21:44 ` Niels Glodny
2025-04-27 22:06 ` [PATCH v2] " Niels Glodny
2025-04-29 14:09 ` [PATCH v3] " Niels Glodny
2 siblings, 1 reply; 9+ messages in thread
From: Phillip Wood @ 2025-04-27 15:04 UTC (permalink / raw)
To: Niels Glodny, git; +Cc: johannes.schindelin, peff, phillip.wood
Hi Niels
On 25/04/2025 16:59, Niels Glodny wrote:
> The cleanup_records function marks some lines as changed
> before running the actual diff algorithm. For most lines,
> this is a good performance optimization, but it also marks
> lines that are surrounded by many changed lines as changed
> as well. This can cause redundant changes and longer-than-
> necessary diffs.
Nicely explained. We normally wrap commit messages at 72 characters
> Whether this results in better-looking diffs is subjective.
> However, the --minimal flag explicitly requests the shortest
> possible diff.
Looking at the diff for 2ff58dec493 (refs: introduce function to batch
refname availability checks, 2025-03-12) I'd say it is definitely less
readable with this change but some of the changes in 320f2061b63 (t0602:
use subshell to ensure working directory unchanged, 2025-02-28) are more
readable. Anyway as you say if we promise to find the minimal diff then
that is what we should do.
> The performance impact of this change is negligible, and it
> results in shorter diffs in about 1.3% of diffs in Git's
> history.
Have you got any numbers for the performance change?
I think the premise of this patch is sound, I've left a couple of
comments below as I think we can simplify the code changes.
> diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
> new file mode 100755
> index 0000000000..3ad759dab4
> --- /dev/null
> +++ b/t/t4071-diff-minimal.sh
> @@ -0,0 +1,16 @@
> +#!/bin/sh
> +
> +test_description='minimal diff algorithm'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'minimal diff should not mark changes between changed lines' '
> + printf "x\nx\nx\nx\n" >pre &&
> + printf "x\nx\nx\nA\nB\nC\nD\nx\nE\nF\nG\n" >post &&
> + test_must_fail git diff --no-index \
> + --minimal pre post >diff &&
> + ! grep "+x" diff &&
> + ! grep "-x" diff
> +'
Thanks for taking the trouble to add a new test. We have a few test
helper functions which we could use here.
test_write_lines x x x x >pre &&
test_write_lines x x x A B C D x E F G >post &&
test_expect_code 1 git diff --no-index --minimal pre post >diff &&
test_grep ! ^[-+]x diff
test_expect_code ensures that the non-zero exit code is from the files
not matching rather than another error like an invalid option or missing
file. test_grep will display the file if there are matches which helps
when debugging test failures.
> [...]> -static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t
*xdf1, xdfile_t *xdf2) {
> +static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1,
> + xdfile_t *xdf2, int need_min) {
> long i, nm, nreff, mlim;
> xrecord_t **recs;
> xdlclass_t *rcrec;
> @@ -379,7 +383,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
xdlclassifier_t carries a copy of the flag that were interested in so I
think at the start of xdl_cleanup_records we can add
int need_min = !!(cf->flags & XDF_NEED_MINIMAL);
and then we don't need to change the signature.
> for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len2 : 0;
> - dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
If we want a minimal diff then we force dis1[i] to be 1 so that this
line will never be marked as changed before searching for the longest
common sequence. Well spotted.
Best Wishes
Phillip
> }
>
> if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
> @@ -387,7 +391,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len1 : 0;
> - dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
> }
>
> for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
> @@ -449,10 +453,10 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
> }
>
>
> -static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
> -
> +static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1,
> + xdfile_t *xdf2, int need_min) {
> if (xdl_trim_ends(xdf1, xdf2) < 0 ||
> - xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
> + xdl_cleanup_records(cf, xdf1, xdf2, need_min) < 0) {
>
> return -1;
> }
>
> base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] xdiff: disable cleanup_records heuristic with --minimal
2025-04-27 15:04 ` Phillip Wood
@ 2025-04-27 21:44 ` Niels Glodny
2025-04-28 17:05 ` Junio C Hamano
0 siblings, 1 reply; 9+ messages in thread
From: Niels Glodny @ 2025-04-27 21:44 UTC (permalink / raw)
To: phillip.wood, Niels Glodny, git; +Cc: johannes.schindelin, peff
Hi Phillip,
thank you for your detailed comments.
> Have you got any numbers for the performance change?
I have been using "git log -p -3000 --minimal > /dev/null", as in p4000-diff-algorithms.sh. With this patch, I get
Time (mean ± σ): 2.363 s ± 0.023 s (25 runs)
Without this patch, I get
Time (mean ± σ): 2.362 s ± 0.035 s (25 runs)
So the difference is well within the margin of error. It doesn't look like it has any measurable impact on performance.
> I think the premise of this patch is sound, I've left a couple of comments below as I think we can simplify the code changes.
Thanks a lot! I'll apply your suggestions and send a revised version shortly.
Regards,
Niels
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v2] xdiff: disable cleanup_records heuristic with --minimal
2025-04-25 15:59 [PATCH] xdiff: disable cleanup_records heuristic with --minimal Niels Glodny
2025-04-27 15:04 ` Phillip Wood
@ 2025-04-27 22:06 ` Niels Glodny
2025-04-29 9:00 ` Phillip Wood
2025-04-29 14:09 ` [PATCH v3] " Niels Glodny
2 siblings, 1 reply; 9+ messages in thread
From: Niels Glodny @ 2025-04-27 22:06 UTC (permalink / raw)
To: git; +Cc: Niels Glodny, johannes.schindelin, peff, phillip.wood
The cleanup_records function marks some lines as changed before running
the actual diff algorithm. For most lines, this is a good performance
optimization, but it also marks lines that are surrounded by many
changed lines as changed as well. This can cause redundant changes and
longer-than-necessary diffs.
Whether this results in better-looking diffs is subjective. However, the
--minimal flag explicitly requests the shortest possible diff.
A performance impact of this is not measurable, and it results in
shorter diffs in about 1.3% of diffs in Git's history.
Signed-off-by: Niels Glodny <n.glodny@campus.lmu.de>
---
t/meson.build | 1 +
t/t4071-diff-minimal.sh | 14 ++++++++++++++
xdiff/xprepare.c | 5 +++--
3 files changed, 18 insertions(+), 2 deletions(-)
create mode 100755 t/t4071-diff-minimal.sh
diff --git a/t/meson.build b/t/meson.build
index bfb744e886..8f2e9d2c50 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -501,6 +501,7 @@ integration_tests = [
't4068-diff-symmetric-merge-base.sh',
't4069-remerge-diff.sh',
't4070-diff-pairs.sh',
+ 't4071-diff-minimal.sh',
't4100-apply-stat.sh',
't4101-apply-nonl.sh',
't4102-apply-rename.sh',
diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
new file mode 100755
index 0000000000..4c484dadfb
--- /dev/null
+++ b/t/t4071-diff-minimal.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+test_description='minimal diff algorithm'
+
+. ./test-lib.sh
+
+test_expect_success 'minimal diff should not mark changes between changed lines' '
+ test_write_lines x x x x >pre &&
+ test_write_lines x x x A B C D x E F G >post &&
+ test_expect_code 1 git diff --no-index --minimal pre post >diff &&
+ test_grep ! ^[+-]x diff
+'
+
+test_done
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index c84549f6c5..e1d4017b2d 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -368,6 +368,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
xrecord_t **recs;
xdlclass_t *rcrec;
char *dis, *dis1, *dis2;
+ int need_min = !!(cf->flags & XDF_NEED_MINIMAL);
if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2))
return -1;
@@ -379,7 +380,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len2 : 0;
- dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
@@ -387,7 +388,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len1 : 0;
- dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
--
2.34.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] xdiff: disable cleanup_records heuristic with --minimal
2025-04-27 21:44 ` Niels Glodny
@ 2025-04-28 17:05 ` Junio C Hamano
0 siblings, 0 replies; 9+ messages in thread
From: Junio C Hamano @ 2025-04-28 17:05 UTC (permalink / raw)
To: Niels Glodny; +Cc: phillip.wood, git, johannes.schindelin, peff
Niels Glodny <n.glodny@campus.lmu.de> writes:
> Hi Phillip,
>
> thank you for your detailed comments.
>
>> Have you got any numbers for the performance change?
>
> I have been using "git log -p -3000 --minimal > /dev/null", as in
> p4000-diff-algorithms.sh. With this patch, I get
>
> Time (mean ± σ): 2.363 s ± 0.023 s (25 runs)
>
> Without this patch, I get
>
> Time (mean ± σ): 2.362 s ± 0.035 s (25 runs)
>
> So the difference is well within the margin of error. It doesn't
> look like it has any measurable impact on performance.
That is an excellent observation and result. It should be added to
the proposed commit log message, if not already. The commit log is
where you answer questions, similar to what were raised during
review by your reviewers, that future readers of "git log -p" would
have about your change. For them you won't be easily available to
answer their questions, and that is why we stress on the need for
well-written commit log messages.
Thanks.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] xdiff: disable cleanup_records heuristic with --minimal
2025-04-27 22:06 ` [PATCH v2] " Niels Glodny
@ 2025-04-29 9:00 ` Phillip Wood
0 siblings, 0 replies; 9+ messages in thread
From: Phillip Wood @ 2025-04-29 9:00 UTC (permalink / raw)
To: Niels Glodny, git; +Cc: johannes.schindelin, peff, phillip.wood
Hi Niels
On 27/04/2025 23:06, Niels Glodny wrote:
> The cleanup_records function marks some lines as changed before running
> the actual diff algorithm. For most lines, this is a good performance
> optimization, but it also marks lines that are surrounded by many
> changed lines as changed as well. This can cause redundant changes and
> longer-than-necessary diffs.
>
> Whether this results in better-looking diffs is subjective. However, the
> --minimal flag explicitly requests the shortest possible diff.
>
> A performance impact of this is not measurable, and it results in
> shorter diffs in about 1.3% of diffs in Git's history.
Thanks for re-rolling, the changes all look good to me. As Junio said it
would be helpful to put the performance numbers in the commit message.
Thanks
Phillip
> Signed-off-by: Niels Glodny <n.glodny@campus.lmu.de>
> ---
> t/meson.build | 1 +
> t/t4071-diff-minimal.sh | 14 ++++++++++++++
> xdiff/xprepare.c | 5 +++--
> 3 files changed, 18 insertions(+), 2 deletions(-)
> create mode 100755 t/t4071-diff-minimal.sh
>
> diff --git a/t/meson.build b/t/meson.build
> index bfb744e886..8f2e9d2c50 100644
> --- a/t/meson.build
> +++ b/t/meson.build
> @@ -501,6 +501,7 @@ integration_tests = [
> 't4068-diff-symmetric-merge-base.sh',
> 't4069-remerge-diff.sh',
> 't4070-diff-pairs.sh',
> + 't4071-diff-minimal.sh',
> 't4100-apply-stat.sh',
> 't4101-apply-nonl.sh',
> 't4102-apply-rename.sh',
> diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
> new file mode 100755
> index 0000000000..4c484dadfb
> --- /dev/null
> +++ b/t/t4071-diff-minimal.sh
> @@ -0,0 +1,14 @@
> +#!/bin/sh
> +
> +test_description='minimal diff algorithm'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'minimal diff should not mark changes between changed lines' '
> + test_write_lines x x x x >pre &&
> + test_write_lines x x x A B C D x E F G >post &&
> + test_expect_code 1 git diff --no-index --minimal pre post >diff &&
> + test_grep ! ^[+-]x diff
> +'
> +
> +test_done
> diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
> index c84549f6c5..e1d4017b2d 100644
> --- a/xdiff/xprepare.c
> +++ b/xdiff/xprepare.c
> @@ -368,6 +368,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> xrecord_t **recs;
> xdlclass_t *rcrec;
> char *dis, *dis1, *dis2;
> + int need_min = !!(cf->flags & XDF_NEED_MINIMAL);
>
> if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2))
> return -1;
> @@ -379,7 +380,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len2 : 0;
> - dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
> }
>
> if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
> @@ -387,7 +388,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len1 : 0;
> - dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
> }
>
> for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
>
> base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v3] xdiff: disable cleanup_records heuristic with --minimal
2025-04-25 15:59 [PATCH] xdiff: disable cleanup_records heuristic with --minimal Niels Glodny
2025-04-27 15:04 ` Phillip Wood
2025-04-27 22:06 ` [PATCH v2] " Niels Glodny
@ 2025-04-29 14:09 ` Niels Glodny
2025-05-06 13:21 ` Phillip Wood
2 siblings, 1 reply; 9+ messages in thread
From: Niels Glodny @ 2025-04-29 14:09 UTC (permalink / raw)
To: git; +Cc: Niels Glodny, phillip.wood, johannes.schindelin, peff, gitster
The cleanup_records function marks some lines as changed before running
the actual diff algorithm. For most lines, this is a good performance
optimization, but it also marks lines that are surrounded by many
changed lines as changed as well. This can cause redundant changes and
longer-than-necessary diffs.
Whether this results in better-looking diffs is subjective. However, the
--minimal flag explicitly requests the shortest possible diff.
The change results in shorter diffs in about 1.3% of all diffs in Git's
history. Performance wise, I have measured the impact on
"git log -p -3000 --minimal > /dev/null". With this change, I get
Time (mean ± σ): 2.363 s ± 0.023 s (25 runs)
and without this patch I measured
Time (mean ± σ): 2.362 s ± 0.035 s (25 runs).
As the difference is well within the margin of error, this does not seem
to have an impact on performance.
Signed-off-by: Niels Glodny <n.glodny@campus.lmu.de>
---
t/meson.build | 1 +
t/t4071-diff-minimal.sh | 14 ++++++++++++++
xdiff/xprepare.c | 5 +++--
3 files changed, 18 insertions(+), 2 deletions(-)
create mode 100755 t/t4071-diff-minimal.sh
diff --git a/t/meson.build b/t/meson.build
index bfb744e886..8f2e9d2c50 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -501,6 +501,7 @@ integration_tests = [
't4068-diff-symmetric-merge-base.sh',
't4069-remerge-diff.sh',
't4070-diff-pairs.sh',
+ 't4071-diff-minimal.sh',
't4100-apply-stat.sh',
't4101-apply-nonl.sh',
't4102-apply-rename.sh',
diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
new file mode 100755
index 0000000000..4c484dadfb
--- /dev/null
+++ b/t/t4071-diff-minimal.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+test_description='minimal diff algorithm'
+
+. ./test-lib.sh
+
+test_expect_success 'minimal diff should not mark changes between changed lines' '
+ test_write_lines x x x x >pre &&
+ test_write_lines x x x A B C D x E F G >post &&
+ test_expect_code 1 git diff --no-index --minimal pre post >diff &&
+ test_grep ! ^[+-]x diff
+'
+
+test_done
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index c84549f6c5..e1d4017b2d 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -368,6 +368,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
xrecord_t **recs;
xdlclass_t *rcrec;
char *dis, *dis1, *dis2;
+ int need_min = !!(cf->flags & XDF_NEED_MINIMAL);
if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2))
return -1;
@@ -379,7 +380,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len2 : 0;
- dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
@@ -387,7 +388,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
rcrec = cf->rcrecs[(*recs)->ha];
nm = rcrec ? rcrec->len1 : 0;
- dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+ dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
}
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
--
2.34.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH v3] xdiff: disable cleanup_records heuristic with --minimal
2025-04-29 14:09 ` [PATCH v3] " Niels Glodny
@ 2025-05-06 13:21 ` Phillip Wood
2025-05-06 22:50 ` Junio C Hamano
0 siblings, 1 reply; 9+ messages in thread
From: Phillip Wood @ 2025-05-06 13:21 UTC (permalink / raw)
To: Niels Glodny, git; +Cc: phillip.wood, johannes.schindelin, peff, gitster
Hi Niels
On 29/04/2025 15:09, Niels Glodny wrote:
> The cleanup_records function marks some lines as changed before running
> the actual diff algorithm. For most lines, this is a good performance
> optimization, but it also marks lines that are surrounded by many
> changed lines as changed as well. This can cause redundant changes and
> longer-than-necessary diffs.
>
> Whether this results in better-looking diffs is subjective. However, the
> --minimal flag explicitly requests the shortest possible diff.
>
> The change results in shorter diffs in about 1.3% of all diffs in Git's
> history. Performance wise, I have measured the impact on
> "git log -p -3000 --minimal > /dev/null". With this change, I get
> Time (mean ± σ): 2.363 s ± 0.023 s (25 runs)
> and without this patch I measured
> Time (mean ± σ): 2.362 s ± 0.035 s (25 runs).
> As the difference is well within the margin of error, this does not seem
> to have an impact on performance.
Thanks for adding the performance information, this version looks good
to me.
Best Wishes
Phillip
> Signed-off-by: Niels Glodny <n.glodny@campus.lmu.de>
> ---
> t/meson.build | 1 +
> t/t4071-diff-minimal.sh | 14 ++++++++++++++
> xdiff/xprepare.c | 5 +++--
> 3 files changed, 18 insertions(+), 2 deletions(-)
> create mode 100755 t/t4071-diff-minimal.sh
>
> diff --git a/t/meson.build b/t/meson.build
> index bfb744e886..8f2e9d2c50 100644
> --- a/t/meson.build
> +++ b/t/meson.build
> @@ -501,6 +501,7 @@ integration_tests = [
> 't4068-diff-symmetric-merge-base.sh',
> 't4069-remerge-diff.sh',
> 't4070-diff-pairs.sh',
> + 't4071-diff-minimal.sh',
> 't4100-apply-stat.sh',
> 't4101-apply-nonl.sh',
> 't4102-apply-rename.sh',
> diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
> new file mode 100755
> index 0000000000..4c484dadfb
> --- /dev/null
> +++ b/t/t4071-diff-minimal.sh
> @@ -0,0 +1,14 @@
> +#!/bin/sh
> +
> +test_description='minimal diff algorithm'
> +
> +. ./test-lib.sh
> +
> +test_expect_success 'minimal diff should not mark changes between changed lines' '
> + test_write_lines x x x x >pre &&
> + test_write_lines x x x A B C D x E F G >post &&
> + test_expect_code 1 git diff --no-index --minimal pre post >diff &&
> + test_grep ! ^[+-]x diff
> +'
> +
> +test_done
> diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
> index c84549f6c5..e1d4017b2d 100644
> --- a/xdiff/xprepare.c
> +++ b/xdiff/xprepare.c
> @@ -368,6 +368,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> xrecord_t **recs;
> xdlclass_t *rcrec;
> char *dis, *dis1, *dis2;
> + int need_min = !!(cf->flags & XDF_NEED_MINIMAL);
>
> if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2))
> return -1;
> @@ -379,7 +380,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len2 : 0;
> - dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
> }
>
> if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
> @@ -387,7 +388,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
> for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
> rcrec = cf->rcrecs[(*recs)->ha];
> nm = rcrec ? rcrec->len1 : 0;
> - dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
> + dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
> }
>
> for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
>
> base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v3] xdiff: disable cleanup_records heuristic with --minimal
2025-05-06 13:21 ` Phillip Wood
@ 2025-05-06 22:50 ` Junio C Hamano
0 siblings, 0 replies; 9+ messages in thread
From: Junio C Hamano @ 2025-05-06 22:50 UTC (permalink / raw)
To: Phillip Wood; +Cc: Niels Glodny, git, phillip.wood, johannes.schindelin, peff
Phillip Wood <phillip.wood123@gmail.com> writes:
> Hi Niels
>
> On 29/04/2025 15:09, Niels Glodny wrote:
>> The cleanup_records function marks some lines as changed before running
>> the actual diff algorithm. For most lines, this is a good performance
>> optimization, but it also marks lines that are surrounded by many
>> changed lines as changed as well. This can cause redundant changes and
>> longer-than-necessary diffs.
>> Whether this results in better-looking diffs is subjective. However,
>> the
>> --minimal flag explicitly requests the shortest possible diff.
>> The change results in shorter diffs in about 1.3% of all diffs in
>> Git's
>> history. Performance wise, I have measured the impact on
>> "git log -p -3000 --minimal > /dev/null". With this change, I get
>> Time (mean ± σ): 2.363 s ± 0.023 s (25 runs)
>> and without this patch I measured
>> Time (mean ± σ): 2.362 s ± 0.035 s (25 runs).
>> As the difference is well within the margin of error, this does not seem
>> to have an impact on performance.
>
> Thanks for adding the performance information, this version looks good
> to me.
>
> Best Wishes
>
> Phillip
Yup, thanks, both of you.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2025-05-06 22:50 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-25 15:59 [PATCH] xdiff: disable cleanup_records heuristic with --minimal Niels Glodny
2025-04-27 15:04 ` Phillip Wood
2025-04-27 21:44 ` Niels Glodny
2025-04-28 17:05 ` Junio C Hamano
2025-04-27 22:06 ` [PATCH v2] " Niels Glodny
2025-04-29 9:00 ` Phillip Wood
2025-04-29 14:09 ` [PATCH v3] " Niels Glodny
2025-05-06 13:21 ` Phillip Wood
2025-05-06 22:50 ` Junio C Hamano
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).