* [PATCH] diff: round down similarity index
@ 2007-06-24 22:23 René Scharfe
2007-06-24 23:25 ` Johannes Schindelin
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: René Scharfe @ 2007-06-24 22:23 UTC (permalink / raw)
To: Junio C Hamano; +Cc: David Kastrup, Johannes Schindelin, Git Mailing List
Rounding down the printed (dis)similarity index allows us to use
"100%" as a special value that indicates complete rewrites and
fully equal file contents, respectively.
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
The added documentation is only correct if you apply the
diffcore-rename patch, too.
Documentation/diff-format.txt | 7 +++++++
diff.c | 30 +++++++++++++++++++-----------
2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/Documentation/diff-format.txt b/Documentation/diff-format.txt
index 18d49d2..0015032 100644
--- a/Documentation/diff-format.txt
+++ b/Documentation/diff-format.txt
@@ -126,6 +126,13 @@ the file that rename/copy produces, respectively.
If there is need for such substitution then the whole
pathname is put in double quotes.
+The similarity index is the percentage of unchanged lines, and
+the dissimilarity index is the percentage of changed lines. It
+is a rounded down integer, followed by a percent sign. The
+similarity index value of 100% is thus reserved for two equal
+files, while 100% dissimilarity means that no line from the old
+file made it into the new one.
+
combined diff format
--------------------
diff --git a/diff.c b/diff.c
index 4aa9bbc..f1db55d 100644
--- a/diff.c
+++ b/diff.c
@@ -1813,6 +1813,19 @@ static void diff_fill_sha1_info(struct diff_filespec *one)
hashclr(one->sha1);
}
+static int similarity_index(struct diff_filepair *p)
+{
+ int result = p->score * 100.0 / MAX_SCORE;
+
+ /* Paranoia: guard against floating point rounding errors. */
+ if (p->score == MAX_SCORE)
+ result = 100;
+ else if (result == 100)
+ result = 99;
+
+ return result;
+}
+
static void run_diff(struct diff_filepair *p, struct diff_options *o)
{
const char *pgm = external_diff();
@@ -1847,23 +1860,20 @@ static void run_diff(struct diff_filepair *p, struct diff_options *o)
"similarity index %d%%\n"
"copy from %s\n"
"copy to %s\n",
- (int)(0.5 + p->score * 100.0/MAX_SCORE),
- name_munged, other_munged);
+ similarity_index(p), name_munged, other_munged);
break;
case DIFF_STATUS_RENAMED:
len += snprintf(msg + len, sizeof(msg) - len,
"similarity index %d%%\n"
"rename from %s\n"
"rename to %s\n",
- (int)(0.5 + p->score * 100.0/MAX_SCORE),
- name_munged, other_munged);
+ similarity_index(p), name_munged, other_munged);
break;
case DIFF_STATUS_MODIFIED:
if (p->score) {
len += snprintf(msg + len, sizeof(msg) - len,
"dissimilarity index %d%%\n",
- (int)(0.5 + p->score *
- 100.0/MAX_SCORE));
+ similarity_index(p));
complete_rewrite = 1;
break;
}
@@ -2385,8 +2395,7 @@ static void diff_flush_raw(struct diff_filepair *p,
}
if (p->score)
- sprintf(status, "%c%03d", p->status,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ sprintf(status, "%c%03d", p->status, similarity_index(p));
else {
status[0] = p->status;
status[1] = 0;
@@ -2668,8 +2677,7 @@ static void show_rename_copy(const char *renamecopy, struct diff_filepair *p)
{
char *names = pprint_rename(p->one->path, p->two->path);
- printf(" %s %s (%d%%)\n", renamecopy, names,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ printf(" %s %s (%d%%)\n", renamecopy, names, similarity_index(p));
free(names);
show_mode_change(p, 0);
}
@@ -2693,7 +2701,7 @@ static void diff_summary(struct diff_filepair *p)
if (p->score) {
char *name = quote_one(p->two->path);
printf(" rewrite %s (%d%%)\n", name,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ similarity_index(p));
free(name);
show_mode_change(p, 0);
} else show_mode_change(p, 1);
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH] diff: round down similarity index
2007-06-24 22:23 [PATCH] diff: round down similarity index René Scharfe
@ 2007-06-24 23:25 ` Johannes Schindelin
2007-06-24 23:27 ` Johannes Schindelin
2007-06-25 5:08 ` David Kastrup
2007-06-25 6:18 ` Junio C Hamano
2 siblings, 1 reply; 8+ messages in thread
From: Johannes Schindelin @ 2007-06-24 23:25 UTC (permalink / raw)
To: René Scharfe; +Cc: Junio C Hamano, David Kastrup, Git Mailing List
[-- Attachment #1: Type: TEXT/PLAIN, Size: 904 bytes --]
Hi,
On Mon, 25 Jun 2007, René Scharfe wrote:
> Rounding down the printed (dis)similarity index allows us to use
> "100%" as a special value that indicates complete rewrites and
> fully equal file contents, respectively.
>
> [...]
>
> +static int similarity_index(struct diff_filepair *p)
> +{
> + int result = p->score * 100.0 / MAX_SCORE;
> +
> + /* Paranoia: guard against floating point rounding errors. */
> + if (p->score == MAX_SCORE)
> + result = 100;
> + else if (result == 100)
> + result = 99;
> +
> + return result;
> +}
That's not even properly rounding down. The correct formula (correct in
the sense for "what you want") would be
p->score * 100.0 / MAX_SCORE
if p->score == MAX_SCORE, iff the files are identical. And yes, that is
the old formula.
Besides, AFAIR p->score is not even calculated if the files are identical,
because that hits a different code path.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH] diff: round down similarity index
2007-06-24 23:25 ` Johannes Schindelin
@ 2007-06-24 23:27 ` Johannes Schindelin
2007-06-25 1:29 ` René Scharfe
0 siblings, 1 reply; 8+ messages in thread
From: Johannes Schindelin @ 2007-06-24 23:27 UTC (permalink / raw)
To: René Scharfe; +Cc: Junio C Hamano, David Kastrup, Git Mailing List
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: TEXT/PLAIN; charset=utf-8, Size: 1068 bytes --]
Hi,
On Mon, 25 Jun 2007, Johannes Schindelin wrote:
> On Mon, 25 Jun 2007, René Scharfe wrote:
>
> > Rounding down the printed (dis)similarity index allows us to use
> > "100%" as a special value that indicates complete rewrites and
> > fully equal file contents, respectively.
> >
> > [...]
> >
> > +static int similarity_index(struct diff_filepair *p)
> > +{
> > + int result = p->score * 100.0 / MAX_SCORE;
> > +
> > + /* Paranoia: guard against floating point rounding errors. */
> > + if (p->score == MAX_SCORE)
> > + result = 100;
> > + else if (result == 100)
> > + result = 99;
> > +
> > + return result;
> > +}
>
> That's not even properly rounding down. The correct formula (correct in
> the sense for "what you want") would be
>
> p->score * 100.0 / MAX_SCORE
>
> if p->score == MAX_SCORE, iff the files are identical. And yes, that is
> the old formula.
Just ignore that, please.
> Besides, AFAIR p->score is not even calculated if the files are identical,
> because that hits a different code path.
But this still holds true.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH] diff: round down similarity index
2007-06-24 23:27 ` Johannes Schindelin
@ 2007-06-25 1:29 ` René Scharfe
0 siblings, 0 replies; 8+ messages in thread
From: René Scharfe @ 2007-06-25 1:29 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: Junio C Hamano, David Kastrup, Git Mailing List
Johannes Schindelin schrieb:
> Hi,
>
> On Mon, 25 Jun 2007, Johannes Schindelin wrote:
>
>> On Mon, 25 Jun 2007, Ren� Scharfe wrote:
>>
>>> Rounding down the printed (dis)similarity index allows us to use
>>> "100%" as a special value that indicates complete rewrites and
>>> fully equal file contents, respectively.
>>>
>>> [...]
>>>
>>> +static int similarity_index(struct diff_filepair *p)
>>> +{
>>> + int result = p->score * 100.0 / MAX_SCORE;
>>> +
>>> + /* Paranoia: guard against floating point rounding errors. */
>>> + if (p->score == MAX_SCORE)
>>> + result = 100;
>>> + else if (result == 100)
>>> + result = 99;
>>> +
>>> + return result;
>>> +}
>> Besides, AFAIR p->score is not even calculated if the files are identical,
>> because that hits a different code path.
True, ->score is set to MAX_SCORE for identical files by a different
code path than the one that actually compares the contents and
calculates a score. That doesn't matter for printing the "similarity
index nn%" line etc., though. Or is there a way for identical files to
end up with a ->score of 0 (or some other value != MAX_SCORE) that I missed?
In any case, the patch doesn't change the way the score is calculated,
i.e. its value is the same as before. It only changes how it is displayed.
René
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] diff: round down similarity index
2007-06-24 22:23 [PATCH] diff: round down similarity index René Scharfe
2007-06-24 23:25 ` Johannes Schindelin
@ 2007-06-25 5:08 ` David Kastrup
2007-06-25 6:18 ` Junio C Hamano
2 siblings, 0 replies; 8+ messages in thread
From: David Kastrup @ 2007-06-25 5:08 UTC (permalink / raw)
To: René Scharfe; +Cc: Junio C Hamano, Johannes Schindelin, Git Mailing List
René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> +static int similarity_index(struct diff_filepair *p)
> +{
> + int result = p->score * 100.0 / MAX_SCORE;
Use floor(p->score ... MAX_SCORE) here: I don't think that C otherwise
specifies a preferred way of rounding on float->int conversions.
> + /* Paranoia: guard against floating point rounding errors. */
> + if (p->score == MAX_SCORE)
> + result = 100;
> + else if (result == 100)
> + result = 99;
Also, p->score itself must be calculated by truncating.
--
David Kastrup, Kriemhildstr. 15, 44793 Bochum
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH] diff: round down similarity index
2007-06-24 22:23 [PATCH] diff: round down similarity index René Scharfe
2007-06-24 23:25 ` Johannes Schindelin
2007-06-25 5:08 ` David Kastrup
@ 2007-06-25 6:18 ` Junio C Hamano
[not found] ` <86k5tsh761.fsf@lola.quinscape.zz>
2007-06-25 10:34 ` [PATCH] diff: round down similarity index, take 2 René Scharfe
2 siblings, 2 replies; 8+ messages in thread
From: Junio C Hamano @ 2007-06-25 6:18 UTC (permalink / raw)
To: René Scharfe; +Cc: David Kastrup, Johannes Schindelin, Git Mailing List
René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> +static int similarity_index(struct diff_filepair *p)
> +{
> + int result = p->score * 100.0 / MAX_SCORE;
> +
> + /* Paranoia: guard against floating point rounding errors. */
> + if (p->score == MAX_SCORE)
> + result = 100;
> + else if (result == 100)
> + result = 99;
> +
> + return result;
> +}
Why not simply do this?
static int similarity_index(struct diff_filepair *p)
{
if (p->score == MAX_SCORE)
return 100;
return p->score * 100 / MAX_SCORE;
}
MAX_SCORE and p->score run up to 60000 and we assume int is at
least 32-bit, so I do not think there is no risk of overflowing.
^ permalink raw reply [flat|nested] 8+ messages in thread[parent not found: <86k5tsh761.fsf@lola.quinscape.zz>]
* Re: [PATCH] diff: round down similarity index
[not found] ` <86k5tsh761.fsf@lola.quinscape.zz>
@ 2007-06-25 10:34 ` René Scharfe
0 siblings, 0 replies; 8+ messages in thread
From: René Scharfe @ 2007-06-25 10:34 UTC (permalink / raw)
To: David Kastrup; +Cc: Junio C Hamano, Johannes Schindelin, Git Mailing List
David Kastrup schrieb:
> Junio C Hamano <gitster@pobox.com> writes:
>
>> René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
>>
>>> +static int similarity_index(struct diff_filepair *p)
>>> +{
>>> + int result = p->score * 100.0 / MAX_SCORE;
>>> +
>>> + /* Paranoia: guard against floating point rounding errors. */
>>> + if (p->score == MAX_SCORE)
>>> + result = 100;
>>> + else if (result == 100)
>>> + result = 99;
>>> +
>>> + return result;
>>> +}
>> Why not simply do this?
>>
>> static int similarity_index(struct diff_filepair *p)
>> {
>> if (p->score == MAX_SCORE)
>> return 100;
>> return p->score * 100 / MAX_SCORE;
>> }
>>
>> MAX_SCORE and p->score run up to 60000 and we assume int is at
>> least 32-bit, so I do not think there is no risk of overflowing.
Good idea! *smacks himself against the forehead*
I just copied the old code (sans the addition of 0.5) and then tried
to work around it's rounding problem (which I'm not even certain
exists and matters). Look out for an updated patch..
> You bet me to it on the way to work. Anyway, when working with
> integers, the first two lines are definitely not required (I am
> assuming that p->score is integer as well but have not checked).
Right.
> What probably requires more thought is the calculation of p->score
> itself: if one has large numbers as a ratio (original vs changes
> lines), multiplying by 60000 before dividing by the second large
> numbers might exceed the size of the integers.
MAX_SCORE is defined as 60000.0, which makes it a double. AFAICS all
the multiplications with MAX_SCORE are done using floating point
arithmetic for that reason.
René
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] diff: round down similarity index, take 2
2007-06-25 6:18 ` Junio C Hamano
[not found] ` <86k5tsh761.fsf@lola.quinscape.zz>
@ 2007-06-25 10:34 ` René Scharfe
1 sibling, 0 replies; 8+ messages in thread
From: René Scharfe @ 2007-06-25 10:34 UTC (permalink / raw)
To: Junio C Hamano; +Cc: David Kastrup, Johannes Schindelin, Git Mailing List
Rounding down the printed (dis)similarity index allows us to use
"100%" as a special value that indicates complete rewrites and
fully equal file contents, respectively.
While at it, switch index calculation to integer arithmetic as
suggested by Junio and David Kastrup.
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
The added documentation is only correct if you apply the
diffcore-rename patch, too.
Documentation/diff-format.txt | 7 +++++++
diff.c | 22 +++++++++++-----------
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/Documentation/diff-format.txt b/Documentation/diff-format.txt
index 18d49d2..0015032 100644
--- a/Documentation/diff-format.txt
+++ b/Documentation/diff-format.txt
@@ -126,6 +126,13 @@ the file that rename/copy produces, respectively.
If there is need for such substitution then the whole
pathname is put in double quotes.
+The similarity index is the percentage of unchanged lines, and
+the dissimilarity index is the percentage of changed lines. It
+is a rounded down integer, followed by a percent sign. The
+similarity index value of 100% is thus reserved for two equal
+files, while 100% dissimilarity means that no line from the old
+file made it into the new one.
+
combined diff format
--------------------
diff --git a/diff.c b/diff.c
index 9938969..81527bf 100644
--- a/diff.c
+++ b/diff.c
@@ -1813,6 +1813,11 @@ static void diff_fill_sha1_info(struct diff_filespec *one)
hashclr(one->sha1);
}
+static int similarity_index(struct diff_filepair *p)
+{
+ return p->score * 100 / (int)MAX_SCORE;
+}
+
static void run_diff(struct diff_filepair *p, struct diff_options *o)
{
const char *pgm = external_diff();
@@ -1847,23 +1852,20 @@ static void run_diff(struct diff_filepair *p, struct diff_options *o)
"similarity index %d%%\n"
"copy from %s\n"
"copy to %s\n",
- (int)(0.5 + p->score * 100.0/MAX_SCORE),
- name_munged, other_munged);
+ similarity_index(p), name_munged, other_munged);
break;
case DIFF_STATUS_RENAMED:
len += snprintf(msg + len, sizeof(msg) - len,
"similarity index %d%%\n"
"rename from %s\n"
"rename to %s\n",
- (int)(0.5 + p->score * 100.0/MAX_SCORE),
- name_munged, other_munged);
+ similarity_index(p), name_munged, other_munged);
break;
case DIFF_STATUS_MODIFIED:
if (p->score) {
len += snprintf(msg + len, sizeof(msg) - len,
"dissimilarity index %d%%\n",
- (int)(0.5 + p->score *
- 100.0/MAX_SCORE));
+ similarity_index(p));
complete_rewrite = 1;
break;
}
@@ -2387,8 +2389,7 @@ static void diff_flush_raw(struct diff_filepair *p,
}
if (p->score)
- sprintf(status, "%c%03d", p->status,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ sprintf(status, "%c%03d", p->status, similarity_index(p));
else {
status[0] = p->status;
status[1] = 0;
@@ -2670,8 +2671,7 @@ static void show_rename_copy(const char *renamecopy, struct diff_filepair *p)
{
char *names = pprint_rename(p->one->path, p->two->path);
- printf(" %s %s (%d%%)\n", renamecopy, names,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ printf(" %s %s (%d%%)\n", renamecopy, names, similarity_index(p));
free(names);
show_mode_change(p, 0);
}
@@ -2695,7 +2695,7 @@ static void diff_summary(struct diff_filepair *p)
if (p->score) {
char *name = quote_one(p->two->path);
printf(" rewrite %s (%d%%)\n", name,
- (int)(0.5 + p->score * 100.0/MAX_SCORE));
+ similarity_index(p));
free(name);
show_mode_change(p, 0);
} else show_mode_change(p, 1);
^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2007-06-25 10:35 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-06-24 22:23 [PATCH] diff: round down similarity index René Scharfe
2007-06-24 23:25 ` Johannes Schindelin
2007-06-24 23:27 ` Johannes Schindelin
2007-06-25 1:29 ` René Scharfe
2007-06-25 5:08 ` David Kastrup
2007-06-25 6:18 ` Junio C Hamano
[not found] ` <86k5tsh761.fsf@lola.quinscape.zz>
2007-06-25 10:34 ` René Scharfe
2007-06-25 10:34 ` [PATCH] diff: round down similarity index, take 2 René Scharfe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).