From: Antoine Pelisse <apelisse@gmail.com>
To: git@vger.kernel.org
Cc: Antoine Pelisse <apelisse@gmail.com>, Junio C Hamano <gitster@pobox.com>
Subject: [PATCH] combine-diff: coalesce lost lines optimally
Date: Sun, 17 Mar 2013 14:03:56 +0100 [thread overview]
Message-ID: <1363525436-21667-1-git-send-email-apelisse@gmail.com> (raw)
In-Reply-To: <7vboalw6lt.fsf@alter.siamese.dyndns.org>
This replaces the greedy implementation to coalesce lost lines by using
dynamic programming to find the Longest Common Subsequence.
The O(n²) time complexity is obviously bigger than previous
implementation but it can produce shorter diff results (and most likely
easier to read).
List of lost lines is now doubly-linked because we reverse-read it when
reading the direction matrix.
Signed-off-by: Antoine Pelisse <apelisse@gmail.com>
---
Hi,
This is a very first draft for improving the way we coalesce lost
lines. It has only been tested with the two scenarios below.
What is left to do:
- Test it more extensively
- Had some tests scenarios
I'm also having a hard time trying it with more than two parents. How I
am supposed to have more than two parents while octopus merge refuses if
there are conflicts ?
Tested scenarios:
git init
>test
git add test
git commit -m initial
git checkout -b side1
seq 10 >>test
git commit -m all -a
git checkout master
seq 1 2 10 >test
git commit -m three -a
git merge side1
>test
git commit -m merge -a
git show
AND
git init
>test
git add test
git commit -m initial
echo "3\n1\n2\n4" >test
git commit -m shuffled -a
git checkout -b side HEAD^
echo "1\n2\n3\n4" >test
git commit -m sorted -a
git merge master
>test
git commit -m merge -a
git show
combine-diff.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 160 insertions(+), 32 deletions(-)
diff --git a/combine-diff.c b/combine-diff.c
index 35d41cd..252dd72 100644
--- a/combine-diff.c
+++ b/combine-diff.c
@@ -73,16 +73,24 @@ static struct combine_diff_path *intersect_paths(struct combine_diff_path *curr,
/* Lines lost from parent */
struct lline {
- struct lline *next;
+ struct lline *next, *prev;
int len;
unsigned long parent_map;
char line[FLEX_ARRAY];
};
+/* Lines lost from current parent (before coalescing) */
+struct plost {
+ struct lline *lost_head, *lost_tail;
+ int len;
+};
+
/* Lines surviving in the merge result */
struct sline {
- struct lline *lost_head, **lost_tail;
- struct lline *next_lost;
+ /* Accumulated and coalesced lost lines */
+ struct lline *lost;
+ int lenlost;
+ struct plost plost;
char *bol;
int len;
/* bit 0 up to (N-1) are on if the parent has this line (i.e.
@@ -94,6 +102,132 @@ struct sline {
unsigned long *p_lno;
};
+enum coalesce_direction { MATCH, BASE, NEW };
+
+/* Coalesce new lines into base by finding LCS */
+static struct lline *coalesce_lines(struct lline *base, int *lenbase,
+ struct lline *new, int lennew,
+ unsigned long parent)
+{
+ int **lcs;
+ enum coalesce_direction **directions;
+ struct lline *baseend, *newend;
+ int i, j, origbaselen = *lenbase;
+
+ if (new == NULL)
+ return base;
+
+ if (base == NULL) {
+ *lenbase = lennew;
+ return new;
+ }
+
+ /*
+ * Coalesce new lines into base by finding the LCS
+ * - Create the table to run dynamic programing
+ * - Compute the LCS
+ * - Then reverse read the direction structure:
+ * - If we have MATCH, assign parent to base flag, and consume
+ * both baseend and newend
+ * - Else if we have BASE, consume baseend
+ * - Else if we have NEW, insert newend lline into base and
+ * consume newend
+ */
+ lcs = xcalloc(origbaselen + 1, sizeof(int*));
+ directions = xcalloc(origbaselen + 1, sizeof(enum coalesce_direction*));
+ for (i = 0; i < origbaselen + 1; i++) {
+ lcs[i] = xcalloc(lennew + 1, sizeof(int));
+ directions[i] = xcalloc(lennew + 1, sizeof(enum coalesce_direction));
+ directions[i][0] = BASE;
+ }
+ for (j = 1; j < lennew + 1; j++)
+ directions[0][j] = NEW;
+
+ for (i = 1, baseend = base; i < origbaselen + 1; i++) {
+ for (j = 1, newend = new; j < lennew + 1; j++) {
+ if (baseend->len == newend->len &&
+ !memcmp(baseend->line, newend->line, baseend->len)) {
+ lcs[i][j] = lcs[i - 1][j - 1] + 1;
+ directions[i][j] = MATCH;
+ } else if (lcs[i][j - 1] >= lcs[i - 1][j]) {
+ lcs[i][j] = lcs[i][j - 1];
+ directions[i][j] = NEW;
+ } else {
+ lcs[i][j] = lcs[i - 1][j];
+ directions[i][j] = BASE;
+ }
+ if (newend->next)
+ newend = newend->next;
+ }
+ if (baseend->next)
+ baseend = baseend->next;
+ }
+
+ for (i = 0; i < origbaselen + 1; i++)
+ free(lcs[i]);
+ free(lcs);
+
+ /* At this point, baseend and newend point to the end of each lists */
+ i--;
+ j--;
+ while (i != 0 || j != 0) {
+ if (directions[i][j] == MATCH) {
+ baseend->parent_map |= 1<<parent;
+ baseend = baseend->prev;
+ newend = newend->prev;
+ i--;
+ j--;
+ } else if (directions[i][j] == NEW) {
+ struct lline *lline;
+
+ lline = newend;
+ /* Remove lline from new list and update newend */
+ if (lline->prev)
+ lline->prev->next = lline->next;
+ else
+ new = lline->next;
+ if (lline->next)
+ lline->next->prev = lline->prev;
+
+ newend = lline->prev;
+ j--;
+
+ /* Add lline to base list */
+ if (baseend) {
+ lline->next = baseend->next;
+ lline->prev = baseend;
+ if (lline->prev)
+ lline->prev->next = lline;
+ }
+ else {
+ lline->next = base;
+ base = lline;
+ }
+ (*lenbase)++;
+
+ if (lline->next)
+ lline->next->prev = lline;
+
+ } else {
+ baseend = baseend->prev;
+ i--;
+ }
+ }
+
+ newend = new;
+ while (newend) {
+ struct lline *lline = newend;
+ newend = newend->next;
+ free(lline);
+ }
+
+ for (i = 0; i < origbaselen + 1; i++)
+ free(directions[i]);
+ free(directions);
+
+ return base;
+}
+
static char *grab_blob(const unsigned char *sha1, unsigned int mode,
unsigned long *size, struct userdiff_driver *textconv,
const char *path)
@@ -129,29 +263,19 @@ static void append_lost(struct sline *sline, int n, const char *line, int len)
if (line[len-1] == '\n')
len--;
- /* Check to see if we can squash things */
- if (sline->lost_head) {
- lline = sline->next_lost;
- while (lline) {
- if (lline->len == len &&
- !memcmp(lline->line, line, len)) {
- lline->parent_map |= this_mask;
- sline->next_lost = lline->next;
- return;
- }
- lline = lline->next;
- }
- }
-
lline = xmalloc(sizeof(*lline) + len + 1);
lline->len = len;
lline->next = NULL;
+ lline->prev = sline->plost.lost_tail;
+ if (lline->prev)
+ lline->prev->next = lline;
+ else
+ sline->plost.lost_head = lline;
+ sline->plost.lost_tail = lline;
+ sline->plost.len++;
lline->parent_map = this_mask;
memcpy(lline->line, line, len);
lline->line[len] = 0;
- *sline->lost_tail = lline;
- sline->lost_tail = &lline->next;
- sline->next_lost = NULL;
}
struct combine_diff_state {
@@ -194,7 +318,6 @@ static void consume_line(void *state_, char *line, unsigned long len)
xcalloc(state->num_parent,
sizeof(unsigned long));
state->sline[state->nb-1].p_lno[state->n] = state->ob;
- state->lost_bucket->next_lost = state->lost_bucket->lost_head;
return;
}
if (!state->lost_bucket)
@@ -255,8 +378,17 @@ static void combine_diff(const unsigned char *parent, unsigned int mode,
struct lline *ll;
sline[lno].p_lno[n] = p_lno;
+ /* Coalesce new lines */
+ if (sline[lno].plost.lost_head) {
+ struct sline *sl = &sline[lno];
+ sl->lost = coalesce_lines(sl->lost, &sl->lenlost,
+ sl->plost.lost_head, sl->plost.len, n);
+ sl->plost.lost_head = sl->plost.lost_tail = NULL;
+ sl->plost.len = 0;
+ }
+
/* How many lines would this sline advance the p_lno? */
- ll = sline[lno].lost_head;
+ ll = sline[lno].lost;
while (ll) {
if (ll->parent_map & nmask)
p_lno++; /* '-' means parent had it */
@@ -276,7 +408,7 @@ static int interesting(struct sline *sline, unsigned long all_mask)
/* If some parents lost lines here, or if we have added to
* some parent, it is interesting.
*/
- return ((sline->flag & all_mask) || sline->lost_head);
+ return ((sline->flag & all_mask) || sline->lost);
}
static unsigned long adjust_hunk_tail(struct sline *sline,
@@ -459,7 +591,7 @@ static int make_hunks(struct sline *sline, unsigned long cnt,
has_interesting = 0;
for (j = i; j < hunk_end && !has_interesting; j++) {
unsigned long this_diff = sline[j].flag & all_mask;
- struct lline *ll = sline[j].lost_head;
+ struct lline *ll = sline[j].lost;
if (this_diff) {
/* This has some changes. Is it the
* same as others?
@@ -613,7 +745,7 @@ static void dump_sline(struct sline *sline, const char *line_prefix,
int j;
unsigned long p_mask;
struct sline *sl = &sline[lno++];
- ll = (sl->flag & no_pre_delete) ? NULL : sl->lost_head;
+ ll = (sl->flag & no_pre_delete) ? NULL : sl->lost;
while (ll) {
printf("%s%s", line_prefix, c_old);
for (j = 0; j < num_parent; j++) {
@@ -664,7 +796,7 @@ static void reuse_combine_diff(struct sline *sline, unsigned long cnt,
jmask = (1UL<<j);
for (lno = 0; lno <= cnt; lno++) {
- struct lline *ll = sline->lost_head;
+ struct lline *ll = sline->lost;
sline->p_lno[i] = sline->p_lno[j];
while (ll) {
if (ll->parent_map & jmask)
@@ -923,10 +1055,6 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
sline = xcalloc(cnt+2, sizeof(*sline));
sline[0].bol = result;
- for (lno = 0; lno <= cnt + 1; lno++) {
- sline[lno].lost_tail = &sline[lno].lost_head;
- sline[lno].flag = 0;
- }
for (lno = 0, cp = result; cp < result + result_size; cp++) {
if (*cp == '\n') {
sline[lno].len = cp - sline[lno].bol;
@@ -976,8 +1104,8 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
free(result);
for (lno = 0; lno < cnt; lno++) {
- if (sline[lno].lost_head) {
- struct lline *ll = sline[lno].lost_head;
+ if (sline[lno].lost) {
+ struct lline *ll = sline[lno].lost;
while (ll) {
struct lline *tmp = ll;
ll = ll->next;
--
1.7.9.5
next prev parent reply other threads:[~2013-03-17 13:05 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-14 22:32 [RFD] Combine diff improvements Antoine Pelisse
2013-03-14 22:52 ` Junio C Hamano
2013-03-15 5:54 ` Junio C Hamano
2013-03-17 13:03 ` Antoine Pelisse [this message]
2013-03-17 13:58 ` [PATCH] combine-diff: coalesce lost lines optimally Antoine Pelisse
2013-03-17 20:10 ` Junio C Hamano
2013-03-17 20:37 ` Antoine Pelisse
2013-03-17 22:37 ` Junio C Hamano
2013-03-23 17:23 ` [PATCH v2] " Antoine Pelisse
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363525436-21667-1-git-send-email-apelisse@gmail.com \
--to=apelisse@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).