[PATCH v2 1/3] patch-id: make it stable against hunk reordering

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2 1/3] patch-id: make it stable against hunk reordering
@ 2014-03-28 12:30 Michael S. Tsirkin
  2014-03-28 12:30 ` [PATCH v2 2/3] patch-id: document new behaviour Michael S. Tsirkin
                   ` (4 more replies)
  0 siblings, 5 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2014-03-28 12:30 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Patch id changes if you reorder hunks in a diff.
As the result is functionally equivalent, this is surprising to many
people.
In particular, reordering hunks is helpful to make patches
more readable (e.g. API header diff before implementation diff).
In git, it is often done e.g. using the "-O <orderfile>" option,
so supporting it better has value.

Hunks within file can be reordered manually provided
the same pathname can appear more than once in the input.

Change patch-id behaviour making it stable against
hunk reordering:
	- prepend header to each hunk (if not there)
		Note: POSIX requires patch to be robust against hunk reordering
		provided each diff hunk has a header:
		http://pubs.opengroup.org/onlinepubs/7908799/xcu/patch.html
		If the patch file contains more than one patch, patch will attempt to
		apply each of them as if they came from separate patch files. (In this
		case the name of the patch file must be determinable for each diff
		listing.)

	- calculate SHA1 hash for each hunk separately
	- sum all hashes to get patch id

Add a new flag --unstable to get the historical behaviour.

Add --stable which is a nop, for symmetry.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1: documented motivation for supporting
hunk reordering (and not just file reordering).
No code changes.

Junio, you didn't respond so I'm not sure whether I convinced
you that supporting hunk reordering within file has value.
So I kept this functionality around for now, if
you think I should drop this, please let me know explicitly.
Thanks, and sorry about being dense!

 builtin/patch-id.c | 71 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 16 deletions(-)

diff --git a/builtin/patch-id.c b/builtin/patch-id.c
index 3cfe02d..253ad87 100644
--- a/builtin/patch-id.c
+++ b/builtin/patch-id.c
@@ -1,17 +1,14 @@
 #include "builtin.h"
 
-static void flush_current_id(int patchlen, unsigned char *id, git_SHA_CTX *c)
+static void flush_current_id(int patchlen, unsigned char *id, unsigned char *result)
 {
-	unsigned char result[20];
 	char name[50];
 
 	if (!patchlen)
 		return;
 
-	git_SHA1_Final(result, c);
 	memcpy(name, sha1_to_hex(id), 41);
 	printf("%s %s\n", sha1_to_hex(result), name);
-	git_SHA1_Init(c);
 }
 
 static int remove_space(char *line)
@@ -56,10 +53,30 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
 	return 1;
 }
 
-static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct strbuf *line_buf)
+static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
 {
-	int patchlen = 0, found_next = 0;
+	unsigned char hash[20];
+	unsigned short carry = 0;
+	int i;
+
+	git_SHA1_Final(hash, ctx);
+	git_SHA1_Init(ctx);
+	/* 20-byte sum, with carry */
+	for (i = 0; i < 20; ++i) {
+		carry += result[i] + hash[i];
+		result[i] = carry;
+		carry >>= 8;
+	}
+}
+static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
+			   struct strbuf *line_buf, int stable)
+{
+	int patchlen = 0, found_next = 0, hunks = 0;
 	int before = -1, after = -1;
+	git_SHA_CTX ctx, header_ctx;
+
+	git_SHA1_Init(&ctx);
+	hashclr(result);
 
 	while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
 		char *line = line_buf->buf;
@@ -99,6 +116,18 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 			if (!memcmp(line, "@@ -", 4)) {
 				/* Parse next hunk, but ignore line numbers.  */
 				scan_hunk_header(line, &before, &after);
+				if (stable) {
+					if (hunks) {
+						flush_one_hunk(result, &ctx);
+						memcpy(&ctx, &header_ctx,
+						       sizeof ctx);
+					} else {
+						/* Save ctx for next hunk.  */
+						memcpy(&header_ctx, &ctx,
+						       sizeof ctx);
+					}
+				}
+				hunks++;
 				continue;
 			}
 
@@ -107,7 +136,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 				break;
 
 			/* Else we're parsing another header.  */
+			if (stable && hunks)
+				flush_one_hunk(result, &ctx);
 			before = after = -1;
+			hunks = 0;
 		}
 
 		/* If we get here, we're inside a hunk.  */
@@ -119,39 +151,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
 		/* Compute the sha without whitespace */
 		len = remove_space(line);
 		patchlen += len;
-		git_SHA1_Update(ctx, line, len);
+		git_SHA1_Update(&ctx, line, len);
 	}
 
 	if (!found_next)
 		hashclr(next_sha1);
 
+	flush_one_hunk(result, &ctx);
+
 	return patchlen;
 }
 
-static void generate_id_list(void)
+static void generate_id_list(int stable)
 {
-	unsigned char sha1[20], n[20];
-	git_SHA_CTX ctx;
+	unsigned char sha1[20], n[20], result[20];
 	int patchlen;
 	struct strbuf line_buf = STRBUF_INIT;
 
-	git_SHA1_Init(&ctx);
 	hashclr(sha1);
 	while (!feof(stdin)) {
-		patchlen = get_one_patchid(n, &ctx, &line_buf);
-		flush_current_id(patchlen, sha1, &ctx);
+		patchlen = get_one_patchid(n, result, &line_buf, stable);
+		flush_current_id(patchlen, sha1, result);
 		hashcpy(sha1, n);
 	}
 	strbuf_release(&line_buf);
 }
 
-static const char patch_id_usage[] = "git patch-id < patch";
+static const char patch_id_usage[] = "git patch-id [--stable | --unstable] < patch";
 
 int cmd_patch_id(int argc, const char **argv, const char *prefix)
 {
-	if (argc != 1)
+	int stable;
+	if (argc == 2 && !strcmp(argv[1], "--stable"))
+		stable = 1;
+	else if (argc == 2 && !strcmp(argv[1], "--unstable"))
+		stable = 0;
+	else if (argc == 1)
+		stable = 1;
+	else
 		usage(patch_id_usage);
 
-	generate_id_list();
+	generate_id_list(stable);
 	return 0;
 }
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 2/3] patch-id: document new behaviour
  2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
@ 2014-03-28 12:30 ` Michael S. Tsirkin
  2014-03-28 12:30 ` [PATCH v2 3/3] patch-id-test: test new --stable and --unstable flags Michael S. Tsirkin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2014-03-28 12:30 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Clarify that patch ID is now a sum of hashes, not a hash.
Document --stable and --unstable flags.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

No change from v1.

 Documentation/git-patch-id.txt | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/Documentation/git-patch-id.txt b/Documentation/git-patch-id.txt
index 312c3b1..1bc6d52 100644
--- a/Documentation/git-patch-id.txt
+++ b/Documentation/git-patch-id.txt
@@ -8,14 +8,14 @@ git-patch-id - Compute unique ID for a patch
 SYNOPSIS
 --------
 [verse]
-'git patch-id' < <patch>
+'git patch-id' [--stable | --unstable] < <patch>
 
 DESCRIPTION
 -----------
-A "patch ID" is nothing but a SHA-1 of the diff associated with a patch, with
-whitespace and line numbers ignored.  As such, it's "reasonably stable", but at
-the same time also reasonably unique, i.e., two patches that have the same "patch
-ID" are almost guaranteed to be the same thing.
+A "patch ID" is nothing but a sum of SHA-1 of the diff hunks associated with a
+patch, with whitespace and line numbers ignored.  As such, it's "reasonably
+stable", but at the same time also reasonably unique, i.e., two patches that
+have the same "patch ID" are almost guaranteed to be the same thing.
 
 IOW, you can use this thing to look for likely duplicate commits.
 
@@ -27,6 +27,17 @@ This can be used to make a mapping from patch ID to commit ID.
 
 OPTIONS
 -------
+
+--stable::
+	Use a symmetrical sum of hashes, such that order of
+	hunks in the diff does not affect the ID.
+	This is the default.
+
+--unstable::
+	Use a non-symmetrical sum of hashes, such that order of
+	hunks in the diff affects the ID.
+	This was the default value for git 1.9 and older.
+
 <patch>::
 	The diff to create the ID of.
 
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v2 3/3] patch-id-test: test new --stable and --unstable flags
  2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
  2014-03-28 12:30 ` [PATCH v2 2/3] patch-id: document new behaviour Michael S. Tsirkin
@ 2014-03-28 12:30 ` Michael S. Tsirkin
  2014-03-28 18:20 ` [PATCH v2 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2014-03-28 12:30 UTC (permalink / raw)
  To: git; +Cc: sunshine, jrnieder, peff, gitster

Verify that patch ID is now stable against hunk reordering.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

changes from v1:
	Use <<-\EOF to address comment by Eric Sunshine

 t/t4204-patch-id.sh | 68 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/t/t4204-patch-id.sh b/t/t4204-patch-id.sh
index d2c930d..44dfd33 100755
--- a/t/t4204-patch-id.sh
+++ b/t/t4204-patch-id.sh
@@ -5,12 +5,27 @@ test_description='git patch-id'
 . ./test-lib.sh
 
 test_expect_success 'setup' '
-	test_commit initial foo a &&
-	test_commit first foo b &&
+	test_commit initial-foo foo a &&
+	test_commit initial-bar bar a &&
+	echo b > foo &&
+	echo b > bar &&
+	git commit -a -m first &&
 	git checkout -b same HEAD^ &&
-	test_commit same-msg foo b &&
+	echo b > foo &&
+	echo b > bar &&
+	git commit -a -m same-msg &&
 	git checkout -b notsame HEAD^ &&
-	test_commit notsame-msg foo c
+	echo c > foo &&
+	echo c > bar &&
+	git commit -a -m notsame-msg &&
+	cat > bar-then-foo <<-\EOF &&
+		bar
+		foo
+		EOF
+	cat > foo-then-bar <<-\EOF
+		foo
+		bar
+		EOF
 '
 
 test_expect_success 'patch-id output is well-formed' '
@@ -23,11 +38,33 @@ calc_patch_id () {
 		sed "s# .*##" > patch-id_"$1"
 }
 
+calc_patch_id_unstable () {
+	git patch-id --unstable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+calc_patch_id_stable () {
+	git patch-id --stable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+
 get_patch_id () {
-	git log -p -1 "$1" | git patch-id |
+	git log -p -1 "$1" -O bar-then-foo -- | git patch-id |
 		sed "s# .*##" > patch-id_"$1"
 }
 
+get_patch_id_stable () {
+	git log -p -1 "$1" -O bar-then-foo | git patch-id --stable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+get_patch_id_unstable () {
+	git log -p -1 "$1" -O bar-then-foo | git patch-id --unstable |
+		sed "s# .*##" > patch-id_"$1"
+}
+
+
 test_expect_success 'patch-id detects equality' '
 	get_patch_id master &&
 	get_patch_id same &&
@@ -56,6 +93,27 @@ test_expect_success 'whitespace is irrelevant in footer' '
 	test_cmp patch-id_master patch-id_same
 '
 
+test_expect_success 'file order is irrelevant by default' '
+	get_patch_id master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar | calc_patch_id same &&
+	test_cmp patch-id_master patch-id_same
+'
+
+test_expect_success 'file order is irrelevant with --stable' '
+	get_patch_id_stable master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar | calc_patch_id_stable same &&
+	test_cmp patch-id_master patch-id_same
+'
+
+test_expect_success 'file order is relevant with --unstable' '
+	get_patch_id_unstable master &&
+	git checkout same &&
+	git format-patch -1 --stdout -O foo-then-bar | calc_patch_id_unstable notsame &&
+	! test_cmp patch-id_master patch-id_notsame
+'
+
 test_expect_success 'patch-id supports git-format-patch MIME output' '
 	get_patch_id master &&
 	git checkout same &&
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/3] patch-id: make it stable against hunk reordering
  2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
  2014-03-28 12:30 ` [PATCH v2 2/3] patch-id: document new behaviour Michael S. Tsirkin
  2014-03-28 12:30 ` [PATCH v2 3/3] patch-id-test: test new --stable and --unstable flags Michael S. Tsirkin
@ 2014-03-28 18:20 ` Junio C Hamano
  2014-03-28 19:20 ` Junio C Hamano
  2014-03-28 22:12 ` Junio C Hamano
  4 siblings, 0 replies; 8+ messages in thread
From: Junio C Hamano @ 2014-03-28 18:20 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> Patch id changes if you reorder hunks in a diff.

Reordering "files" is fine, and as we discussed, having multiple
patches that touch the same path is fine, but do not make it sound as
if you are allowing reordering of "hunks" inside a single patch that
touches a single file.

Thanks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/3] patch-id: make it stable against hunk reordering
  2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
                   ` (2 preceding siblings ...)
  2014-03-28 18:20 ` [PATCH v2 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
@ 2014-03-28 19:20 ` Junio C Hamano
  2014-03-30 10:52   ` Michael S. Tsirkin
  2014-03-28 22:12 ` Junio C Hamano
  4 siblings, 1 reply; 8+ messages in thread
From: Junio C Hamano @ 2014-03-28 19:20 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> Patch id changes if you reorder hunks in a diff.
> As the result is functionally equivalent, this is surprising to many
> people.
> In particular, reordering hunks is helpful to make patches
> more readable (e.g. API header diff before implementation diff).
> In git, it is often done e.g. using the "-O <orderfile>" option,
> so supporting it better has value.
>
> Hunks within file can be reordered manually provided
> the same pathname can appear more than once in the input.
>
> Change patch-id behaviour making it stable against
> hunk reordering:
> 	- prepend header to each hunk (if not there)
> 		Note: POSIX requires patch to be robust against hunk reordering
> 		provided each diff hunk has a header:
> 		http://pubs.opengroup.org/onlinepubs/7908799/xcu/patch.html
> 		If the patch file contains more than one patch, patch will attempt to
> 		apply each of them as if they came from separate patch files. (In this
> 		case the name of the patch file must be determinable for each diff
> 		listing.)
>
> 	- calculate SHA1 hash for each hunk separately
> 	- sum all hashes to get patch id
>
> Add a new flag --unstable to get the historical behaviour.
>
> Add --stable which is a nop, for symmetry.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> Changes from v1: documented motivation for supporting
> hunk reordering (and not just file reordering).
> No code changes.
>
> Junio, you didn't respond so I'm not sure whether I convinced
> you that supporting hunk reordering within file has value.
> So I kept this functionality around for now, if
> you think I should drop this, please let me know explicitly.
> Thanks, and sorry about being dense!

The motivation I read from the exchange was that:

 (1) we definitely want to support a mode that is stable with use of
     "diff -O" (i.e. reordering patches per paths);

 (2) supporting a patch with swapped "hunks" does not have any
     practical value.  When you have a patch to the same file F with
     two hunks starting at lines 20 and 40, manually reordering
     hunks to create a patch that shows the hunk that starts at line
     40 and then the other hunk that starts at line 20 is not a
     useful exercise;

 (3) but supporting a patch that touches the same path more than
     once is in line with what "patch" and "git apply" after
     7a07841c (git-apply: handle a patch that touches the same path
     more than once better, 2008-06-27) do.

In other words, the goal of this change would be to give the same id
for all these three:

    (A) straight-forward:

        diff --git a/foo.c b/foo.c
        --- a/foo.c
        +++ b/foo.c
        @@ -20,1 +20,1 @@

        -foo
        +bar

        @@ -40,1 +40,1 @@

        -frotz
        +xyzzy

    (B) as two patches concatenated together:

        diff --git a/foo.c b/foo.c
        --- a/foo.c
        +++ b/foo.c
        @@ -20,1 +20,1 @@

        -foo
        +bar

        diff --git a/foo.c b/foo.c
        --- a/foo.c
        +++ b/foo.c
        @@ -40,1 +40,1 @@

        -frotz
        +xyzzy

    (C) the same as (B) but with a swapped order:

        diff --git a/foo.c b/foo.c
        --- a/foo.c
        +++ b/foo.c
        @@ -40,1 +40,1 @@

        -frotz
        +xyzzy
        diff --git a/foo.c b/foo.c
        --- a/foo.c
        +++ b/foo.c
        @@ -20,1 +20,1 @@

        -foo
        +bar

Am I reading you correctly?

If that is the case, I think I can buy such a change.  It appears to
me that in addition to changing the way the bytes from multiple
hunks are hashed, it would need to hash the file-level headers
together with each hunk when processing input (A) in order to make
the output consistent with the output for the latter two.

Alternatively, you could hash the header for the same path only once
when processing input like (B) or (C) and mix.  That would also give
you the same result as processing (A) in a straight-forward way.

>  builtin/patch-id.c | 71 ++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 55 insertions(+), 16 deletions(-)
>
> diff --git a/builtin/patch-id.c b/builtin/patch-id.c
> index 3cfe02d..253ad87 100644
> --- a/builtin/patch-id.c
> +++ b/builtin/patch-id.c
> @@ -1,17 +1,14 @@
>  #include "builtin.h"
>  
> -static void flush_current_id(int patchlen, unsigned char *id, git_SHA_CTX *c)
> +static void flush_current_id(int patchlen, unsigned char *id, unsigned char *result)
>  {
> -	unsigned char result[20];
>  	char name[50];
>  
>  	if (!patchlen)
>  		return;
>  
> -	git_SHA1_Final(result, c);
>  	memcpy(name, sha1_to_hex(id), 41);
>  	printf("%s %s\n", sha1_to_hex(result), name);
> -	git_SHA1_Init(c);
>  }
>  
>  static int remove_space(char *line)
> @@ -56,10 +53,30 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
>  	return 1;
>  }
>  
> -static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct strbuf *line_buf)
> +static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
>  {
> -	int patchlen = 0, found_next = 0;
> +	unsigned char hash[20];
> +	unsigned short carry = 0;
> +	int i;
> +
> +	git_SHA1_Final(hash, ctx);
> +	git_SHA1_Init(ctx);
> +	/* 20-byte sum, with carry */
> +	for (i = 0; i < 20; ++i) {
> +		carry += result[i] + hash[i];
> +		result[i] = carry;
> +		carry >>= 8;
> +	}
> +}
> +static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
> +			   struct strbuf *line_buf, int stable)
> +{
> +	int patchlen = 0, found_next = 0, hunks = 0;
>  	int before = -1, after = -1;
> +	git_SHA_CTX ctx, header_ctx;
> +
> +	git_SHA1_Init(&ctx);
> +	hashclr(result);
>  
>  	while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
>  		char *line = line_buf->buf;
> @@ -99,6 +116,18 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  			if (!memcmp(line, "@@ -", 4)) {
>  				/* Parse next hunk, but ignore line numbers.  */
>  				scan_hunk_header(line, &before, &after);
> +				if (stable) {
> +					if (hunks) {
> +						flush_one_hunk(result, &ctx);
> +						memcpy(&ctx, &header_ctx,
> +						       sizeof ctx);
> +					} else {
> +						/* Save ctx for next hunk.  */
> +						memcpy(&header_ctx, &ctx,
> +						       sizeof ctx);
> +					}
> +				}
> +				hunks++;
>  				continue;
>  			}
>  
> @@ -107,7 +136,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  				break;
>  
>  			/* Else we're parsing another header.  */
> +			if (stable && hunks)
> +				flush_one_hunk(result, &ctx);
>  			before = after = -1;
> +			hunks = 0;
>  		}
>  
>  		/* If we get here, we're inside a hunk.  */
> @@ -119,39 +151,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  		/* Compute the sha without whitespace */
>  		len = remove_space(line);
>  		patchlen += len;
> -		git_SHA1_Update(ctx, line, len);
> +		git_SHA1_Update(&ctx, line, len);
>  	}
>  
>  	if (!found_next)
>  		hashclr(next_sha1);
>  
> +	flush_one_hunk(result, &ctx);
> +
>  	return patchlen;
>  }
>  
> -static void generate_id_list(void)
> +static void generate_id_list(int stable)
>  {
> -	unsigned char sha1[20], n[20];
> -	git_SHA_CTX ctx;
> +	unsigned char sha1[20], n[20], result[20];
>  	int patchlen;
>  	struct strbuf line_buf = STRBUF_INIT;
>  
> -	git_SHA1_Init(&ctx);
>  	hashclr(sha1);
>  	while (!feof(stdin)) {
> -		patchlen = get_one_patchid(n, &ctx, &line_buf);
> -		flush_current_id(patchlen, sha1, &ctx);
> +		patchlen = get_one_patchid(n, result, &line_buf, stable);
> +		flush_current_id(patchlen, sha1, result);
>  		hashcpy(sha1, n);
>  	}
>  	strbuf_release(&line_buf);
>  }
>  
> -static const char patch_id_usage[] = "git patch-id < patch";
> +static const char patch_id_usage[] = "git patch-id [--stable | --unstable] < patch";
>  
>  int cmd_patch_id(int argc, const char **argv, const char *prefix)
>  {
> -	if (argc != 1)
> +	int stable;
> +	if (argc == 2 && !strcmp(argv[1], "--stable"))
> +		stable = 1;
> +	else if (argc == 2 && !strcmp(argv[1], "--unstable"))
> +		stable = 0;
> +	else if (argc == 1)
> +		stable = 1;
> +	else
>  		usage(patch_id_usage);
>  
> -	generate_id_list();
> +	generate_id_list(stable);
>  	return 0;
>  }

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/3] patch-id: make it stable against hunk reordering
  2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
                   ` (3 preceding siblings ...)
  2014-03-28 19:20 ` Junio C Hamano
@ 2014-03-28 22:12 ` Junio C Hamano
  2014-03-28 22:26   ` Junio C Hamano
  4 siblings, 1 reply; 8+ messages in thread
From: Junio C Hamano @ 2014-03-28 22:12 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

"Michael S. Tsirkin" <mst@redhat.com> writes:

> +static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
>  {
> -	int patchlen = 0, found_next = 0;
> +	unsigned char hash[20];
> +	unsigned short carry = 0;
> +	int i;
> +
> +	git_SHA1_Final(hash, ctx);
> +	git_SHA1_Init(ctx);
> +	/* 20-byte sum, with carry */
> +	for (i = 0; i < 20; ++i) {
> +		carry += result[i] + hash[i];
> +		result[i] = carry;
> +		carry >>= 8;
> +	}

Was there a reason why bitwise xor is not sufficient for mixing
these two independent hashes?  If the 20-byte sums do not offer
benefit over that, the code for bitwise xor would certainly be
simpler, I would imagine?

> +}
> +static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
> +			   struct strbuf *line_buf, int stable)
> +{
> +	int patchlen = 0, found_next = 0, hunks = 0;
>  	int before = -1, after = -1;
> +	git_SHA_CTX ctx, header_ctx;
> +
> +	git_SHA1_Init(&ctx);
> +	hashclr(result);
>  
>  	while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
>  		char *line = line_buf->buf;
> @@ -99,6 +116,18 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  			if (!memcmp(line, "@@ -", 4)) {
>  				/* Parse next hunk, but ignore line numbers.  */
>  				scan_hunk_header(line, &before, &after);
> +				if (stable) {
> +					if (hunks) {
> +						flush_one_hunk(result, &ctx);
> +						memcpy(&ctx, &header_ctx,
> +						       sizeof ctx);
> +					} else {
> +						/* Save ctx for next hunk.  */
> +						memcpy(&header_ctx, &ctx,
> +						       sizeof ctx);
> +					}
> +				}
> +				hunks++;
>  				continue;
>  			}
>  
> @@ -107,7 +136,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  				break;
>  
>  			/* Else we're parsing another header.  */
> +			if (stable && hunks)
> +				flush_one_hunk(result, &ctx);
>  			before = after = -1;
> +			hunks = 0;
>  		}
>  
>  		/* If we get here, we're inside a hunk.  */
> @@ -119,39 +151,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>  		/* Compute the sha without whitespace */
>  		len = remove_space(line);
>  		patchlen += len;
> -		git_SHA1_Update(ctx, line, len);
> +		git_SHA1_Update(&ctx, line, len);
>  	}
>  
>  	if (!found_next)
>  		hashclr(next_sha1);
>  
> +	flush_one_hunk(result, &ctx);

What I read from these changes is that you do not do anything
special about the per-file header, so two non-overlapping patches
with a single hunk each that touches the same path concatenated
together would not result in the same patch-id as a single-patch
that has the same two hunks.  Which would break your earlier 'Yes,
reordering only the hunks will not make sense, but before each hunk
you could insert the same "diff --git a/... b/..." to make them a
concatenation of patches that touch the same file', I would think.

Is that what we want to happen?  Or is my reading mistaken?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/3] patch-id: make it stable against hunk reordering
  2014-03-28 22:12 ` Junio C Hamano
@ 2014-03-28 22:26   ` Junio C Hamano
  0 siblings, 0 replies; 8+ messages in thread
From: Junio C Hamano @ 2014-03-28 22:26 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: git, sunshine, jrnieder, peff

Junio C Hamano <gitster@pobox.com> writes:

>> @@ -99,6 +116,18 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>>  			if (!memcmp(line, "@@ -", 4)) {
>>  				/* Parse next hunk, but ignore line numbers.  */
>>  				scan_hunk_header(line, &before, &after);
>> +				if (stable) {
>> +					if (hunks) {
>> +						flush_one_hunk(result, &ctx);
>> +						memcpy(&ctx, &header_ctx,
>> +						       sizeof ctx);
>> +					} else {
>> +						/* Save ctx for next hunk.  */
>> +						memcpy(&header_ctx, &ctx,
>> +						       sizeof ctx);
>> +					}
>> +				}
>> +				hunks++;
>>  				continue;
>>  			}
>>  
>> @@ -107,7 +136,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>>  				break;
>>  
>>  			/* Else we're parsing another header.  */
>> +			if (stable && hunks)
>> +				flush_one_hunk(result, &ctx);
>>  			before = after = -1;
>> +			hunks = 0;
>>  		}
>>  
>>  		/* If we get here, we're inside a hunk.  */
>> @@ -119,39 +151,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
>>  		/* Compute the sha without whitespace */
>>  		len = remove_space(line);
>>  		patchlen += len;
>> -		git_SHA1_Update(ctx, line, len);
>> +		git_SHA1_Update(&ctx, line, len);
>>  	}
>>  
>>  	if (!found_next)
>>  		hashclr(next_sha1);
>>  
>> +	flush_one_hunk(result, &ctx);
>
> What I read from these changes is that you do not do anything
> special about the per-file header, so two non-overlapping patches
> with a single hunk each that touches the same path concatenated
> together would not result in the same patch-id as a single-patch
> that has the same two hunks.  Which would break your earlier 'Yes,
> reordering only the hunks will not make sense, but before each hunk
> you could insert the same "diff --git a/... b/..." to make them a
> concatenation of patches that touch the same file', I would think.
>
> Is that what we want to happen?  Or is my reading mistaken?

Heh, I was blind---copying into header_ctx and then restoring that
in preparation for the next round is exactly for duplicating the
per-file header sum to each and every hunk, so you are already doing
that.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 1/3] patch-id: make it stable against hunk reordering
  2014-03-28 19:20 ` Junio C Hamano
@ 2014-03-30 10:52   ` Michael S. Tsirkin
  0 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2014-03-30 10:52 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, sunshine, jrnieder, peff

On Fri, Mar 28, 2014 at 12:20:13PM -0700, Junio C Hamano wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > Patch id changes if you reorder hunks in a diff.
> > As the result is functionally equivalent, this is surprising to many
> > people.
> > In particular, reordering hunks is helpful to make patches
> > more readable (e.g. API header diff before implementation diff).
> > In git, it is often done e.g. using the "-O <orderfile>" option,
> > so supporting it better has value.
> >
> > Hunks within file can be reordered manually provided
> > the same pathname can appear more than once in the input.
> >
> > Change patch-id behaviour making it stable against
> > hunk reordering:
> > 	- prepend header to each hunk (if not there)
> > 		Note: POSIX requires patch to be robust against hunk reordering
> > 		provided each diff hunk has a header:
> > 		http://pubs.opengroup.org/onlinepubs/7908799/xcu/patch.html
> > 		If the patch file contains more than one patch, patch will attempt to
> > 		apply each of them as if they came from separate patch files. (In this
> > 		case the name of the patch file must be determinable for each diff
> > 		listing.)
> >
> > 	- calculate SHA1 hash for each hunk separately
> > 	- sum all hashes to get patch id
> >
> > Add a new flag --unstable to get the historical behaviour.
> >
> > Add --stable which is a nop, for symmetry.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >
> > Changes from v1: documented motivation for supporting
> > hunk reordering (and not just file reordering).
> > No code changes.
> >
> > Junio, you didn't respond so I'm not sure whether I convinced
> > you that supporting hunk reordering within file has value.
> > So I kept this functionality around for now, if
> > you think I should drop this, please let me know explicitly.
> > Thanks, and sorry about being dense!
> 
> The motivation I read from the exchange was that:
> 
>  (1) we definitely want to support a mode that is stable with use of
>      "diff -O" (i.e. reordering patches per paths);
> 
>  (2) supporting a patch with swapped "hunks" does not have any
>      practical value.  When you have a patch to the same file F with
>      two hunks starting at lines 20 and 40, manually reordering
>      hunks to create a patch that shows the hunk that starts at line
>      40 and then the other hunk that starts at line 20 is not a
>      useful exercise;

I agree, especially since the resulting patch can't be applied with
either patch or git apply.

>  (3) but supporting a patch that touches the same path more than
>      once is in line with what "patch" and "git apply" after
>      7a07841c (git-apply: handle a patch that touches the same path
>      more than once better, 2008-06-27) do.
> 
> In other words, the goal of this change would be to give the same id
> for all these three:
> 
>     (A) straight-forward:
> 
>         diff --git a/foo.c b/foo.c
>         --- a/foo.c
>         +++ b/foo.c
>         @@ -20,1 +20,1 @@
> 
>         -foo
>         +bar
> 
>         @@ -40,1 +40,1 @@
> 
>         -frotz
>         +xyzzy
> 
>     (B) as two patches concatenated together:
> 
>         diff --git a/foo.c b/foo.c
>         --- a/foo.c
>         +++ b/foo.c
>         @@ -20,1 +20,1 @@
> 
>         -foo
>         +bar
> 
>         diff --git a/foo.c b/foo.c
>         --- a/foo.c
>         +++ b/foo.c
>         @@ -40,1 +40,1 @@
> 
>         -frotz
>         +xyzzy
> 
>     (C) the same as (B) but with a swapped order:
> 
>         diff --git a/foo.c b/foo.c
>         --- a/foo.c
>         +++ b/foo.c
>         @@ -40,1 +40,1 @@
> 
>         -frotz
>         +xyzzy
>         diff --git a/foo.c b/foo.c
>         --- a/foo.c
>         +++ b/foo.c
>         @@ -20,1 +20,1 @@
> 
>         -foo
>         +bar
> 
> Am I reading you correctly?

Absolutely.

> If that is the case, I think I can buy such a change.  It appears to
> me that in addition to changing the way the bytes from multiple
> hunks are hashed, it would need to hash the file-level headers
> together with each hunk when processing input (A) in order to make
> the output consistent with the output for the latter two.

Exactly. This is what this change attempts to do:
+                                   if (hunks) {
+                                           flush_one_hunk(result, &ctx);
+                                           memcpy(&ctx, &header_ctx,
+                                                  sizeof ctx);
+                                   } else {
+                                           /* Save ctx for next hunk.  */
+                                           memcpy(&header_ctx, &ctx,
+                                                  sizeof ctx);
+                                   }

I'll add more code comments to clarify the logic here.

> Alternatively, you could hash the header for the same path only once
> when processing input like (B) or (C) and mix.  That would also give
> you the same result as processing (A) in a straight-forward way.


Yes, that is also possible.
I think I slightly prefer the former way because this
variant gives the same result for
         diff --git a/foo.c b/foo.c
         --- a/foo.c
         +++ b/foo.c
         @@ -20,2 +20,3 @@
 
         +bar
 
         diff --git a/bar.c b/bar.c
         --- a/bar.c
         +++ b/bar.c
         @@ -40,2 +40,3 @@
 
         +foo

and
         diff --git a/bar.c b/bar.c
         --- a/bar.c
         +++ b/bar.c
         @@ -20,2 +20,3 @@
 
         +foo
 
         diff --git a/foo.c b/foo.c
         --- a/foo.c
         +++ b/foo.c
         @@ -40,2 +40,3 @@
 
         +bar


but if you disagree, pls let me know - it's not
a strong preference.

> >  builtin/patch-id.c | 71 ++++++++++++++++++++++++++++++++++++++++++------------
> >  1 file changed, 55 insertions(+), 16 deletions(-)
> >
> > diff --git a/builtin/patch-id.c b/builtin/patch-id.c
> > index 3cfe02d..253ad87 100644
> > --- a/builtin/patch-id.c
> > +++ b/builtin/patch-id.c
> > @@ -1,17 +1,14 @@
> >  #include "builtin.h"
> >  
> > -static void flush_current_id(int patchlen, unsigned char *id, git_SHA_CTX *c)
> > +static void flush_current_id(int patchlen, unsigned char *id, unsigned char *result)
> >  {
> > -	unsigned char result[20];
> >  	char name[50];
> >  
> >  	if (!patchlen)
> >  		return;
> >  
> > -	git_SHA1_Final(result, c);
> >  	memcpy(name, sha1_to_hex(id), 41);
> >  	printf("%s %s\n", sha1_to_hex(result), name);
> > -	git_SHA1_Init(c);
> >  }
> >  
> >  static int remove_space(char *line)
> > @@ -56,10 +53,30 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
> >  	return 1;
> >  }
> >  
> > -static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct strbuf *line_buf)
> > +static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
> >  {
> > -	int patchlen = 0, found_next = 0;
> > +	unsigned char hash[20];
> > +	unsigned short carry = 0;
> > +	int i;
> > +
> > +	git_SHA1_Final(hash, ctx);
> > +	git_SHA1_Init(ctx);
> > +	/* 20-byte sum, with carry */
> > +	for (i = 0; i < 20; ++i) {
> > +		carry += result[i] + hash[i];
> > +		result[i] = carry;
> > +		carry >>= 8;
> > +	}
> > +}
> > +static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
> > +			   struct strbuf *line_buf, int stable)
> > +{
> > +	int patchlen = 0, found_next = 0, hunks = 0;
> >  	int before = -1, after = -1;
> > +	git_SHA_CTX ctx, header_ctx;
> > +
> > +	git_SHA1_Init(&ctx);
> > +	hashclr(result);
> >  
> >  	while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
> >  		char *line = line_buf->buf;
> > @@ -99,6 +116,18 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
> >  			if (!memcmp(line, "@@ -", 4)) {
> >  				/* Parse next hunk, but ignore line numbers.  */
> >  				scan_hunk_header(line, &before, &after);
> > +				if (stable) {
> > +					if (hunks) {
> > +						flush_one_hunk(result, &ctx);
> > +						memcpy(&ctx, &header_ctx,
> > +						       sizeof ctx);
> > +					} else {
> > +						/* Save ctx for next hunk.  */
> > +						memcpy(&header_ctx, &ctx,
> > +						       sizeof ctx);
> > +					}
> > +				}
> > +				hunks++;
> >  				continue;
> >  			}
> >  
> > @@ -107,7 +136,10 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
> >  				break;
> >  
> >  			/* Else we're parsing another header.  */
> > +			if (stable && hunks)
> > +				flush_one_hunk(result, &ctx);
> >  			before = after = -1;
> > +			hunks = 0;
> >  		}
> >  
> >  		/* If we get here, we're inside a hunk.  */
> > @@ -119,39 +151,46 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
> >  		/* Compute the sha without whitespace */
> >  		len = remove_space(line);
> >  		patchlen += len;
> > -		git_SHA1_Update(ctx, line, len);
> > +		git_SHA1_Update(&ctx, line, len);
> >  	}
> >  
> >  	if (!found_next)
> >  		hashclr(next_sha1);
> >  
> > +	flush_one_hunk(result, &ctx);
> > +
> >  	return patchlen;
> >  }
> >  
> > -static void generate_id_list(void)
> > +static void generate_id_list(int stable)
> >  {
> > -	unsigned char sha1[20], n[20];
> > -	git_SHA_CTX ctx;
> > +	unsigned char sha1[20], n[20], result[20];
> >  	int patchlen;
> >  	struct strbuf line_buf = STRBUF_INIT;
> >  
> > -	git_SHA1_Init(&ctx);
> >  	hashclr(sha1);
> >  	while (!feof(stdin)) {
> > -		patchlen = get_one_patchid(n, &ctx, &line_buf);
> > -		flush_current_id(patchlen, sha1, &ctx);
> > +		patchlen = get_one_patchid(n, result, &line_buf, stable);
> > +		flush_current_id(patchlen, sha1, result);
> >  		hashcpy(sha1, n);
> >  	}
> >  	strbuf_release(&line_buf);
> >  }
> >  
> > -static const char patch_id_usage[] = "git patch-id < patch";
> > +static const char patch_id_usage[] = "git patch-id [--stable | --unstable] < patch";
> >  
> >  int cmd_patch_id(int argc, const char **argv, const char *prefix)
> >  {
> > -	if (argc != 1)
> > +	int stable;
> > +	if (argc == 2 && !strcmp(argv[1], "--stable"))
> > +		stable = 1;
> > +	else if (argc == 2 && !strcmp(argv[1], "--unstable"))
> > +		stable = 0;
> > +	else if (argc == 1)
> > +		stable = 1;
> > +	else
> >  		usage(patch_id_usage);
> >  
> > -	generate_id_list();
> > +	generate_id_list(stable);
> >  	return 0;
> >  }

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-03-30 10:52 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-28 12:30 [PATCH v2 1/3] patch-id: make it stable against hunk reordering Michael S. Tsirkin
2014-03-28 12:30 ` [PATCH v2 2/3] patch-id: document new behaviour Michael S. Tsirkin
2014-03-28 12:30 ` [PATCH v2 3/3] patch-id-test: test new --stable and --unstable flags Michael S. Tsirkin
2014-03-28 18:20 ` [PATCH v2 1/3] patch-id: make it stable against hunk reordering Junio C Hamano
2014-03-28 19:20 ` Junio C Hamano
2014-03-30 10:52   ` Michael S. Tsirkin
2014-03-28 22:12 ` Junio C Hamano
2014-03-28 22:26   ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).