From: "Ezekiel Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Ezekiel Newren <ezekielnewren@gmail.com>,
Ezekiel Newren <ezekielnewren@gmail.com>
Subject: [PATCH 03/10] xdiff: don't waste time guessing the number of lines
Date: Fri, 02 Jan 2026 18:52:17 +0000 [thread overview]
Message-ID: <53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2156.git.git.1767379944.gitgitgadget@gmail.com>
From: Ezekiel Newren <ezekielnewren@gmail.com>
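All lines must be read anyway, so stop estimating the number of lines up front: read both files first, then size the classifier from the exact record counts and classify the records after they have been read in.
Also move the memset() of the classifier into xdl_init_classifier().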
Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
---
xdiff/xprepare.c | 52 +++++++++++++++++++-----------------------------
xdiff/xutils.c | 20 -------------------
xdiff/xutils.h | 1 -
3 files changed, 21 insertions(+), 52 deletions(-)
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index 34c82e4f8e..96a32cc5e9 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -26,8 +26,6 @@
#define XDL_KPDIS_RUN 4
#define XDL_MAX_EQLIMIT 1024
#define XDL_SIMSCAN_WINDOW 100
-#define XDL_GUESS_NLINES1 256
-#define XDL_GUESS_NLINES2 20
#define DISCARD 0
#define KEEP 1
@@ -55,6 +53,8 @@ typedef struct s_xdlclassifier {
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
+ memset(cf, 0, sizeof(xdlclassifier_t));
+
cf->flags = flags;
cf->hbits = xdl_hashbits((unsigned int) size);
@@ -134,12 +134,12 @@ static void xdl_free_ctx(xdfile_t *xdf)
}
-static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
- xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) {
long bsize;
uint64_t hav;
uint8_t const *blk, *cur, *top, *prev;
xrecord_t *crec;
+ long narec = 8;
xdf->reference_index = NULL;
xdf->changed = NULL;
@@ -152,23 +152,21 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
for (top = blk + bsize; cur < top; ) {
prev = cur;
- hav = xdl_hash_record(&cur, top, xpp->flags);
+ hav = xdl_hash_record(&cur, top, flags);
if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
goto abort;
crec = &xdf->recs[xdf->nrec++];
crec->ptr = prev;
crec->size = cur - prev;
crec->line_hash = hav;
- if (xdl_classify_record(pass, cf, crec) < 0)
- goto abort;
}
}
if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
goto abort;
- if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
- (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
+ if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) &&
+ (XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) {
if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
goto abort;
}
@@ -381,37 +379,29 @@ static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
- long enl1, enl2, sample;
xdlclassifier_t cf;
- memset(&cf, 0, sizeof(cf));
-
- /*
- * For histogram diff, we can afford a smaller sample size and
- * thus a poorer estimate of the number of lines, as the hash
- * table (rhash) won't be filled up/grown. The number of lines
- * (nrecs) will be updated correctly anyway by
- * xdl_prepare_ctx().
- */
- sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
- ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
+ if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) {
- enl1 = xdl_guess_lines(mf1, sample) + 1;
- enl2 = xdl_guess_lines(mf2, sample) + 1;
-
- if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
return -1;
+ }
+ if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) {
- if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
-
- xdl_free_classifier(&cf);
+ xdl_free_ctx(&xe->xdf1);
return -1;
}
- if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
- xdl_free_ctx(&xe->xdf1);
- xdl_free_classifier(&cf);
+ if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0)
return -1;
+
+ for (size_t i = 0; i < xe->xdf1.nrec; i++) {
+ xrecord_t *rec = &xe->xdf1.recs[i];
+ xdl_classify_record(1, &cf, rec);
+ }
+
+ for (size_t i = 0; i < xe->xdf2.nrec; i++) {
+ xrecord_t *rec = &xe->xdf2.recs[i];
+ xdl_classify_record(2, &cf, rec);
}
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index 77ee1ad9c8..b3d51197c1 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) {
return data;
}
-long xdl_guess_lines(mmfile_t *mf, long sample) {
- long nl = 0, size, tsize = 0;
- char const *data, *cur, *top;
-
- if ((cur = data = xdl_mmfile_first(mf, &size))) {
- for (top = data + size; nl < sample && cur < top; ) {
- nl++;
- if (!(cur = memchr(cur, '\n', top - cur)))
- cur = top;
- else
- cur++;
- }
- tsize += (long) (cur - data);
- }
-
- if (nl && tsize)
- nl = xdl_mmfile_size(mf) / (tsize / nl);
-
- return nl + 1;
-}
int xdl_blankline(const char *line, long size, long flags)
{
diff --git a/xdiff/xutils.h b/xdiff/xutils.h
index 615b4a9d35..d800840dd0 100644
--- a/xdiff/xutils.h
+++ b/xdiff/xutils.h
@@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
int xdl_cha_init(chastore_t *cha, long isize, long icount);
void xdl_cha_free(chastore_t *cha);
void *xdl_cha_alloc(chastore_t *cha);
-long xdl_guess_lines(mmfile_t *mf, long sample);
int xdl_blankline(const char *line, long size, long flags);
int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);
--
gitgitgadget
next prev parent reply other threads:[~2026-01-02 18:52 UTC|newest]
Thread overview: 124+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-02 18:52 [PATCH 00/10] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 01/10] ivec: introduce the C side of ivec Ezekiel Newren via GitGitGadget
2026-01-04 5:32 ` Junio C Hamano
2026-01-17 16:06 ` Ezekiel Newren
2026-01-08 14:34 ` Phillip Wood
2026-01-15 15:55 ` Ezekiel Newren
2026-01-16 10:39 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 13:55 ` Phillip Wood
2026-01-17 16:04 ` Ezekiel Newren
2026-01-18 14:58 ` René Scharfe
2026-01-17 16:14 ` Ezekiel Newren
2026-01-17 16:16 ` Ezekiel Newren
2026-01-17 17:40 ` Phillip Wood
2026-01-19 5:59 ` Jeff King
2026-01-19 20:21 ` Ezekiel Newren
2026-01-19 20:40 ` Jeff King
2026-01-20 2:36 ` D. Ben Knoble
2026-01-21 21:00 ` Ezekiel Newren
2026-01-21 21:20 ` Jeff King
2026-01-21 21:31 ` Junio C Hamano
2026-01-21 21:45 ` Ezekiel Newren
2026-01-20 13:46 ` Phillip Wood
2026-01-20 14:06 ` Phillip Wood
2026-01-21 21:39 ` Ezekiel Newren
2026-01-28 11:15 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 15:58 ` Ezekiel Newren
2026-01-18 14:55 ` René Scharfe
2026-01-02 18:52 ` [PATCH 02/10] xdiff: make classic diff explicit by creating xdl_do_classic_diff() Ezekiel Newren via GitGitGadget
2026-01-20 15:01 ` Phillip Wood
2026-01-21 21:05 ` Ezekiel Newren
2026-01-02 18:52 ` Ezekiel Newren via GitGitGadget [this message]
2026-01-20 15:02 ` [PATCH 03/10] xdiff: don't waste time guessing the number of lines Phillip Wood
2026-01-21 21:12 ` Ezekiel Newren
2026-01-22 10:16 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 04/10] xdiff: let patience and histogram benefit from xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 15:02 ` Phillip Wood
2026-01-21 14:49 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 05/10] xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 06/10] xdiff: cleanup xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 07/10] xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-28 10:51 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 08/10] xdiff: replace xdfile_t.dend with xdfenv_t.delta_end Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 09/10] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-16 20:19 ` René Scharfe
2026-01-17 16:34 ` Ezekiel Newren
2026-01-18 18:23 ` René Scharfe
2026-01-21 15:01 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 10/10] xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c Ezekiel Newren via GitGitGadget
2026-01-21 15:01 ` Phillip Wood
2026-01-28 10:56 ` Phillip Wood
2026-01-04 2:44 ` [PATCH 00/10] Xdiff cleanup part 3 Junio C Hamano
2026-01-04 6:01 ` Yee Cheng Chin
2026-01-28 14:40 ` Phillip Wood
2026-03-06 23:03 ` Junio C Hamano
2026-03-09 19:06 ` Ezekiel Newren
2026-03-09 23:31 ` Junio C Hamano
2026-03-25 21:11 ` [PATCH v2 0/5] " Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 1/5] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 2/5] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 3/5] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 4/5] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 5/5] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-25 21:58 ` Junio C Hamano
2026-03-26 6:26 ` [PATCH v2 0/5] Xdiff cleanup part 3 SZEDER Gábor
2026-03-27 19:23 ` [PATCH v3 0/6] " Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-27 21:09 ` Junio C Hamano
2026-03-27 23:01 ` Junio C Hamano
2026-03-30 16:00 ` Ezekiel Newren
2026-03-30 19:59 ` Junio C Hamano
2026-03-31 1:29 ` Ezekiel Newren
2026-03-27 19:23 ` [PATCH v3 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-30 16:59 ` [PATCH v4 0/6] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-03-30 16:59 ` [PATCH v4 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-30 17:23 ` Ezekiel Newren
2026-03-30 22:53 ` Junio C Hamano
2026-03-30 16:59 ` [PATCH v4 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-30 22:59 ` Junio C Hamano
2026-03-30 17:00 ` [PATCH v4 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-30 17:00 ` [PATCH v4 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-31 9:44 ` Phillip Wood
2026-03-31 16:13 ` Junio C Hamano
2026-04-14 21:58 ` Ezekiel Newren
2026-04-14 22:15 ` Junio C Hamano
2026-04-15 13:54 ` Phillip Wood
2026-03-30 17:00 ` [PATCH v4 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-30 23:02 ` Junio C Hamano
2026-03-31 9:44 ` Phillip Wood
2026-03-30 17:00 ` [PATCH v4 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-31 9:43 ` Phillip Wood
2026-04-01 16:00 ` Phillip Wood
2026-03-30 23:04 ` [PATCH v4 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-03-31 9:45 ` Phillip Wood
2026-04-08 20:26 ` [PATCH v5 " Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-14 10:09 ` Phillip Wood
2026-04-08 20:26 ` [PATCH v5 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 6/6] xdiff/xdl_cleanup_records: put braces around the else clause Ezekiel Newren via GitGitGadget
2026-04-08 21:28 ` [PATCH v5 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-04-09 14:01 ` Phillip Wood
2026-04-14 10:08 ` Phillip Wood
2026-04-14 17:06 ` Junio C Hamano
2026-04-29 22:08 ` [PATCH v6 " Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 6/6] xdiff/xdl_cleanup_records: make execution of " Ezekiel Newren via GitGitGadget
2026-04-30 13:35 ` [PATCH v6 0/6] Xdiff cleanup part 3 Phillip Wood
2026-04-30 21:08 ` Ezekiel Newren
2026-05-04 0:59 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail.com \
--cc=ezekielnewren@gmail.com \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.