From: Phillip Wood <phillip.wood123@gmail.com>
To: Ezekiel Newren via GitGitGadget <gitgitgadget@gmail.com>,
git@vger.kernel.org
Cc: Ezekiel Newren <ezekielnewren@gmail.com>
Subject: Re: [PATCH 09/10] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records()
Date: Wed, 21 Jan 2026 15:01:12 +0000 [thread overview]
Message-ID: <99b28295-fb14-4f4d-98d9-2caa9be88e33@gmail.com> (raw)
In-Reply-To: <f9b10e71d23f8b4fa34dcffb371cf5a173760409.1767379944.git.gitgitgadget@gmail.com>
Hi Ezekiel
On 02/01/2026 18:52, Ezekiel Newren via GitGitGadget wrote:
> From: Ezekiel Newren <ezekielnewren@gmail.com>
>
> Disentangle xdl_cleanup_records() from the classifier so that it can be
> moved from xprepare.c into xdiffi.c.
>
> The classic diff is the only algorithm that needs to count the number
> of times each line occurs in each file. Make xdl_cleanup_records()
> do this counting itself, instead of the classifier, so that the
> counting won't slow down patience or histogram.
Have you measured the speedup that this gives? It looks like it saves
very little work for the patience or histogram algorithms, and it means
we now make a second pass over the data in the Myers case. If there is a
reason to do this that is related to the Rust conversion, then that might
be a more convincing argument. As René has already said, this isn't a
particularly interesting demonstration of struct IVec - it would be nice
to see more of the API exercised.
Thanks
Phillip
> Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
> ---
> xdiff/xprepare.c | 52 +++++++++++++++++++++++++++++++++---------------
> xdiff/xtypes.h | 1 +
> 2 files changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
> index d3cdb6ac02..b53a3b80c4 100644
> --- a/xdiff/xprepare.c
> +++ b/xdiff/xprepare.c
> @@ -21,6 +21,7 @@
> */
>
> #include "xinclude.h"
> +#include "compat/ivec.h"
>
>
> #define XDL_KPDIS_RUN 4
> @@ -35,7 +36,6 @@ typedef struct s_xdlclass {
> struct s_xdlclass *next;
> xrecord_t rec;
> long idx;
> - long len1, len2;
> } xdlclass_t;
>
> typedef struct s_xdlclassifier {
> @@ -92,7 +92,7 @@ static void xdl_free_classifier(xdlclassifier_t *cf) {
> }
>
>
> -static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) {
> +static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t *rec) {
> size_t hi;
> xdlclass_t *rcrec;
>
> @@ -113,13 +113,10 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
> return -1;
> cf->rcrecs[rcrec->idx] = rcrec;
> rcrec->rec = *rec;
> - rcrec->len1 = rcrec->len2 = 0;
> rcrec->next = cf->rchash[hi];
> cf->rchash[hi] = rcrec;
> }
>
> - (pass == 1) ? rcrec->len1++ : rcrec->len2++;
> -
> rec->minimal_perfect_hash = (size_t)rcrec->idx;
>
> return 0;
> @@ -253,22 +250,44 @@ static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
> return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
> }
>
> +struct xoccurrence
> +{
> + size_t file1, file2;
> +};
> +
> +
> +DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);
> +
>
> /*
> * Try to reduce the problem complexity, discard records that have no
> * matches on the other file. Also, lines that have multiple matches
> * might be potentially discarded if they appear in a run of discardable.
> */
> -static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) {
> - long i, nm, mlim;
> +static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
> + long i;
> + size_t nm, mlim;
> xrecord_t *recs;
> - xdlclass_t *rcrec;
> uint8_t *action1 = NULL, *action2 = NULL;
> - bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
> + struct IVec_xoccurrence occ;
> + bool need_min = !!(flags & XDF_NEED_MINIMAL);
> int ret = 0;
> ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
> ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
>
> + IVEC_INIT(occ);
> + ivec_zero(&occ, xe->mph_size);
> +
> + for (size_t j = 0; j < xe->xdf1.nrec; j++) {
> + size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash;
> + occ.ptr[mph1].file1 += 1;
> + }
> +
> + for (size_t j = 0; j < xe->xdf2.nrec; j++) {
> + size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash;
> + occ.ptr[mph2].file2 += 1;
> + }
> +
> /*
> * Create temporary arrays that will help us decide if
> * changed[i] should remain false, or become true.
> @@ -288,16 +307,14 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) {
> if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT)
> mlim = XDL_MAX_EQLIMIT;
> for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) {
> - rcrec = cf->rcrecs[recs->minimal_perfect_hash];
> - nm = rcrec ? rcrec->len2 : 0;
> + nm = occ.ptr[recs->minimal_perfect_hash].file2;
> action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
> }
>
> if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT)
> mlim = XDL_MAX_EQLIMIT;
> for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) {
> - rcrec = cf->rcrecs[recs->minimal_perfect_hash];
> - nm = rcrec ? rcrec->len1 : 0;
> + nm = occ.ptr[recs->minimal_perfect_hash].file1;
> action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
> }
>
> @@ -332,6 +349,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfenv_t *xe) {
> cleanup:
> xdl_free(action1);
> xdl_free(action2);
> + ivec_free(&occ);
>
> return ret;
> }
> @@ -387,18 +405,20 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
>
> for (size_t i = 0; i < xe->xdf1.nrec; i++) {
> xrecord_t *rec = &xe->xdf1.recs[i];
> - xdl_classify_record(1, &cf, rec);
> + xdl_classify_record(&cf, rec);
> }
>
> for (size_t i = 0; i < xe->xdf2.nrec; i++) {
> xrecord_t *rec = &xe->xdf2.recs[i];
> - xdl_classify_record(2, &cf, rec);
> + xdl_classify_record(&cf, rec);
> }
>
> + xe->mph_size = cf.count;
> +
> xdl_trim_ends(xe);
> if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
> (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
> - xdl_cleanup_records(&cf, xe) < 0) {
> + xdl_cleanup_records(xe, xpp->flags) < 0) {
>
> xdl_free_ctx(&xe->xdf2);
> xdl_free_ctx(&xe->xdf1);
> diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h
> index a939396064..2528bd37e8 100644
> --- a/xdiff/xtypes.h
> +++ b/xdiff/xtypes.h
> @@ -56,6 +56,7 @@ typedef struct s_xdfile {
> typedef struct s_xdfenv {
> xdfile_t xdf1, xdf2;
> size_t delta_start, delta_end;
> + size_t mph_size;
> } xdfenv_t;
>
>
next prev parent reply other threads:[~2026-01-21 15:01 UTC|newest]
Thread overview: 78+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-02 18:52 [PATCH 00/10] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 01/10] ivec: introduce the C side of ivec Ezekiel Newren via GitGitGadget
2026-01-04 5:32 ` Junio C Hamano
2026-01-17 16:06 ` Ezekiel Newren
2026-01-08 14:34 ` Phillip Wood
2026-01-15 15:55 ` Ezekiel Newren
2026-01-16 10:39 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 13:55 ` Phillip Wood
2026-01-17 16:04 ` Ezekiel Newren
2026-01-18 14:58 ` René Scharfe
2026-01-17 16:14 ` Ezekiel Newren
2026-01-17 16:16 ` Ezekiel Newren
2026-01-17 17:40 ` Phillip Wood
2026-01-19 5:59 ` Jeff King
2026-01-19 20:21 ` Ezekiel Newren
2026-01-19 20:40 ` Jeff King
2026-01-20 2:36 ` D. Ben Knoble
2026-01-21 21:00 ` Ezekiel Newren
2026-01-21 21:20 ` Jeff King
2026-01-21 21:31 ` Junio C Hamano
2026-01-21 21:45 ` Ezekiel Newren
2026-01-20 13:46 ` Phillip Wood
2026-01-20 14:06 ` Phillip Wood
2026-01-21 21:39 ` Ezekiel Newren
2026-01-28 11:15 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 15:58 ` Ezekiel Newren
2026-01-18 14:55 ` René Scharfe
2026-01-02 18:52 ` [PATCH 02/10] xdiff: make classic diff explicit by creating xdl_do_classic_diff() Ezekiel Newren via GitGitGadget
2026-01-20 15:01 ` Phillip Wood
2026-01-21 21:05 ` Ezekiel Newren
2026-01-02 18:52 ` [PATCH 03/10] xdiff: don't waste time guessing the number of lines Ezekiel Newren via GitGitGadget
2026-01-20 15:02 ` Phillip Wood
2026-01-21 21:12 ` Ezekiel Newren
2026-01-22 10:16 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 04/10] xdiff: let patience and histogram benefit from xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 15:02 ` Phillip Wood
2026-01-21 14:49 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 05/10] xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 06/10] xdiff: cleanup xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 07/10] xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-28 10:51 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 08/10] xdiff: replace xdfile_t.dend with xdfenv_t.delta_end Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 09/10] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-16 20:19 ` René Scharfe
2026-01-17 16:34 ` Ezekiel Newren
2026-01-18 18:23 ` René Scharfe
2026-01-21 15:01 ` Phillip Wood [this message]
2026-01-02 18:52 ` [PATCH 10/10] xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c Ezekiel Newren via GitGitGadget
2026-01-21 15:01 ` Phillip Wood
2026-01-28 10:56 ` Phillip Wood
2026-01-04 2:44 ` [PATCH 00/10] Xdiff cleanup part 3 Junio C Hamano
2026-01-04 6:01 ` Yee Cheng Chin
2026-01-28 14:40 ` Phillip Wood
2026-03-06 23:03 ` Junio C Hamano
2026-03-09 19:06 ` Ezekiel Newren
2026-03-09 23:31 ` Junio C Hamano
2026-03-25 21:11 ` [PATCH v2 0/5] " Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 1/5] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 2/5] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 3/5] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 4/5] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 5/5] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-25 21:58 ` Junio C Hamano
2026-03-26 6:26 ` [PATCH v2 0/5] Xdiff cleanup part 3 SZEDER Gábor
2026-03-27 19:23 ` [PATCH v3 0/6] " Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-27 21:09 ` Junio C Hamano
2026-03-27 23:01 ` Junio C Hamano
2026-03-27 19:23 ` [PATCH v3 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=99b28295-fb14-4f4d-98d9-2caa9be88e33@gmail.com \
--to=phillip.wood123@gmail.com \
--cc=ezekielnewren@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitgitgadget@gmail.com \
--cc=phillip.wood@dunelm.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox