git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Davide Libenzi <davidel@xmailserver.org>
To: Junio C Hamano <junkio@cox.net>
Cc: git@vger.kernel.org, Linus Torvalds <torvalds@osdl.org>
Subject: Re: [RFH] xdiff shows trivially redundant diff.
Date: Mon, 3 Apr 2006 16:29:16 -0700 (PDT)	[thread overview]
Message-ID: <Pine.LNX.4.64.0604031625330.30048@alien.or.mcafeemobile.com> (raw)
In-Reply-To: <7vzmj3k7x9.fsf@assigned-by-dhcp.cox.net>

On Sun, 2 Apr 2006, Junio C Hamano wrote:

> Davide Libenzi <davidel@xmailserver.org> writes:
>
>> On Sun, 2 Apr 2006, Junio C Hamano wrote:
>>
>>> $ git diff-tree -p 52e8a6^2 52d8a6 -- git-fetch.sh
>>>
>>> shows a change that trivially is redundant, like this:
>>>
>>>    diff --git a/git-fetch.sh b/git-fetch.sh
>>>    index b4325d9..de4f011 100755
>>>    --- a/git-fetch.sh
>>>    +++ b/git-fetch.sh
>>>    @@ -320,7 +320,7 @@ fetch_main () {
>>> ..
>>> Notice the first '-' and '+' lines of second hunk are identical?
>>>
>>> There is another interesting thing.  This is running diff
>>> between 52e8a6^2 and 52d8a6 blobs, but if I change them slightly
>>> so that the first hunk is not different, then this anomaly
>>> disappears.
>>
>> Could you send me the two files that creates the above diff?
>
> I should have tried your pristine xdiff code myself before
> bothering you, but I haven't (sorry).
>
> The problem is from the "stripped down" version we use in git,
> so you may or may not see the problem in your version.  Attached
> are the files.

This is the change I made to libxdiff. Xregression already made a few 
thousands on iterations w/out problems.



- Davide



--- xdiff/xdiffi.c
+++ xdiff/xdiffi.c
@@ -349,12 +349,7 @@
  	kvdf += xe->xdf2.nreff + 1;
  	kvdb += xe->xdf2.nreff + 1;

-	/*
-	 * Classical integer square root approximation using shifts.
-	 */
-	xenv.mxcost = 1;
-	for (; ndiags; ndiags >>= 2)
-		xenv.mxcost <<= 1;
+	xenv.mxcost = xdl_bogosqrt(ndiags);
  	if (xenv.mxcost < XDL_MAX_COST_MIN)
  		xenv.mxcost = XDL_MAX_COST_MIN;
  	xenv.snake_cnt = XDL_SNAKE_CNT;


--- xdiff/xprepare.c
+++ xdiff/xprepare.c
@@ -25,6 +25,7 @@


  #define XDL_KPDIS_RUN 4
+#define XDL_MAX_EQLIMIT 1024



@@ -305,26 +306,48 @@


  static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
-	long r, rdis, rpdis;
+	long r, rdis0, rpdis0, rdis1, rpdis1;

-	for (r = 1, rdis = 0, rpdis = 1; (i - r) >= s; r++) {
+	/*
+	 * Scans the lines before 'i' to find a run of lines that either
+	 * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
+	 * Note that we always call this function with dis[i] > 1, so the
+	 * current line (i) is already a multimatch line.
+	 */
+	for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
  		if (!dis[i - r])
-			rdis++;
+			rdis0++;
  		else if (dis[i - r] == 2)
-			rpdis++;
+			rpdis0++;
  		else
  			break;
  	}
-	for (r = 1; (i + r) <= e; r++) {
+	/*
+	 * If the run before the line 'i' found only multimatch lines, we
+	 * return 0 and hence we don't make the current line (i) discarded.
+	 * We want to discard multimatch lines only when they appear in the
+	 * middle of runs with nomatch lines (dis[j] == 0).
+	 */
+	if (rdis0 == 0)
+		return 0;
+	for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
  		if (!dis[i + r])
-			rdis++;
+			rdis1++;
  		else if (dis[i + r] == 2)
-			rpdis++;
+			rpdis1++;
  		else
  			break;
  	}
+	/*
+	 * If the run after the line 'i' found only multimatch lines, we
+	 * return 0 and hence we don't make the current line (i) discarded.
+	 */
+	if (rdis1 == 0)
+		return 0;
+	rdis1 += rdis0;
+	rpdis1 += rpdis0;

-	return rpdis * XDL_KPDIS_RUN < (rpdis + rdis);
+	return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
  }


@@ -334,34 +357,40 @@
   * might be potentially discarded if they happear in a run of discardable.
   */
  static int xdl_cleanup_records(xdfile_t *xdf1, xdfile_t *xdf2) {
-	long i, rhi, nreff;
+	long i, nm, rhi, nreff, mlim;
  	unsigned long hav;
  	xrecord_t **recs;
  	xrecord_t *rec;
  	char *dis, *dis1, *dis2;

-	if (!(dis = (char *) xdl_malloc((xdf1->nrec + xdf2->nrec + 2) * sizeof(char)))) {
+	if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) {

  		return -1;
  	}
-	memset(dis, 0, (xdf1->nrec + xdf2->nrec + 2) * sizeof(char));
+	memset(dis, 0, xdf1->nrec + xdf2->nrec + 2);
  	dis1 = dis;
  	dis2 = dis1 + xdf1->nrec + 1;

+	if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
+		mlim = XDL_MAX_EQLIMIT;
  	for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
  		hav = (*recs)->ha;
  		rhi = (long) XDL_HASHLONG(hav, xdf2->hbits);
-		for (rec = xdf2->rhash[rhi]; rec; rec = rec->next)
-			if (rec->ha == hav && ++dis1[i] == 2)
+		for (nm = 0, rec = xdf2->rhash[rhi]; rec; rec = rec->next)
+			if (rec->ha == hav && ++nm == mlim)
  				break;
+		dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
  	}

+	if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
+		mlim = XDL_MAX_EQLIMIT;
  	for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
  		hav = (*recs)->ha;
  		rhi = (long) XDL_HASHLONG(hav, xdf1->hbits);
-		for (rec = xdf1->rhash[rhi]; rec; rec = rec->next)
-			if (rec->ha == hav && ++dis2[i] == 2)
+		for (nm = 0, rec = xdf1->rhash[rhi]; rec; rec = rec->next)
+			if (rec->ha == hav && ++nm == mlim)
  				break;
+		dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
  	}

  	for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];


--- xdiff/xutils.c
+++ xdiff/xutils.c
@@ -29,6 +29,19 @@



+long xdl_bogosqrt(long n) {
+	long i;
+
+	/*
+	 * Classical integer square root approximation using shifts.
+	 */
+	for (i = 1; n > 0; n >>= 2)
+		i <<= 1;
+
+	return i;
+}
+
+
  int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
  		     xdemitcb_t *ecb) {
  	int i = 2;


--- xdiff/xutils.h
+++ xdiff/xutils.h
@@ -25,6 +25,7 @@



+long xdl_bogosqrt(long n);
  int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
  		     xdemitcb_t *ecb);
  int xdl_mmfile_outf(void *priv, mmbuffer_t *mb, int nbuf);

  parent reply	other threads:[~2006-04-03 23:29 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-04-02  9:15 [RFH] xdiff shows trivially redundant diff Junio C Hamano
2006-04-02 17:35 ` Davide Libenzi
2006-04-02 21:02   ` Junio C Hamano
2006-04-02 21:16     ` Linus Torvalds
2006-04-02 22:18       ` Davide Libenzi
2006-04-02 22:14     ` Davide Libenzi
2006-04-03  0:52       ` Linus Torvalds
2006-04-03  3:26         ` Davide Libenzi
2006-04-03  4:19           ` Linus Torvalds
2006-04-03  4:30             ` Davide Libenzi
2006-04-03  7:33               ` Junio C Hamano
2006-04-03 19:33                 ` Davide Libenzi
2006-04-03 19:42                   ` Davide Libenzi
2006-04-03 20:01                   ` Junio C Hamano
2006-04-03 22:50                     ` Davide Libenzi
2006-04-03 23:29     ` Davide Libenzi [this message]
2006-04-04  1:55       ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.64.0604031625330.30048@alien.or.mcafeemobile.com \
    --to=davidel@xmailserver.org \
    --cc=git@vger.kernel.org \
    --cc=junkio@cox.net \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).