All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Peter Eriksen" <s022018@student.dtu.dk>
To: git@vger.kernel.org
Subject: [RFH] Exploration of an alternative diff_delta() algorithm
Date: Sun, 9 Apr 2006 16:31:17 +0200	[thread overview]
Message-ID: <20060409143117.GA23908@erlang.gbar.dtu.dk> (raw)

Greetings Gitlings,

I've been trying to implement an alternative algorithm
for diff_delta().  I'm getting close to something that
works, but now I'm stuck!  I think it has something to
do with pack-objects.c, but I'm not sure.  Here's the
first test that fails:

*** t5500-fetch-pack.sh ***
* FAIL 1: 1st pull
        git-fetch-pack -v .. B A > log.txt 2>&1
* FAIL 2: fsck
        git-fsck-objects --full > fsck.txt 2>&1
* FAIL 3: new object count after 1st pull
        test 33 = 0
* FAIL 4: minimal count
        test 33 = 0
* FAIL 5: repack && prune-packed in client
        (git-repack && git-prune-packed)2>>log.txt
*   ok 5: 2nd pull
*   ok 6: fsck
* FAIL 7: new object count after 2nd pull
        test 192 = 198
* FAIL 8: minimal count
        test 192 = 198
* FAIL 9: repack && prune-packed in client
        (git-repack && git-prune-packed)2>>log.txt
*   ok 9: 3rd pull
*   ok 10: fsck
* FAIL 11: new object count after 3rd pull
        test 3 = 228
* FAIL 12: minimal count
        test 3 = 30
* failed 8 among 12 test(s)

I've been looking all around the current diff_delta(), and I
can't see, what I'm missing.  Any ideas?  The file is meant to
replace the current diff-delta.c.

Peter

----->8--diff-delta.c----->8----
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "delta.h"


#define BASE 257
#define PREFIX_SIZE 3

#define SIZE 10
#define HASH_TABLE_SIZE (1<<SIZE)

#define DELTA_SIZE (1024 * 1024)


unsigned int init_hash(unsigned char* data) {
  return data[0]*BASE*BASE + data[1]*BASE + data[2];
}

unsigned int hash(unsigned char* data, unsigned int hash) {
  return (hash - data[-1]*BASE*BASE)*BASE + data[2];
}

#define GR_PRIME 0x9e370001
#define HASH(v) ((v * GR_PRIME) >> (32 - SIZE))

struct entry {
  char file;
  char* offset;
};


void flush(struct entry* table) {
  memset(table, 0, HASH_TABLE_SIZE * sizeof(struct entry));
}


int same_prefixes(char* data1, char* data2) {
  return !memcmp(data1, data2, PREFIX_SIZE);  
}


void encode_add(char* out, int* outpos, char* version_start, char* version_copy) {
  unsigned int size = version_copy - version_start;
  if (!size) return;
  int pos = *outpos;

  while(size > 127) {
    out[pos++] = 127;
    memcpy(out + pos, version_start, 127);
    pos += 127;
    version_start += 127;
    size -= 127;
  }
  out[pos++] = size;
  memcpy(out + pos, version_start, size);  
  pos += size;

  *outpos = pos;
}


void encode_copy(char* out, int* outpos, int offset, int size) {
     int pos = (*outpos) + 1;
     int i = 0x80;

     if (offset & 0xff) { out[pos++] = offset; i |= 0x01; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x02; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x04; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x08; }

     if (size & 0xff) { out[pos++] = size; i |= 0x10; }
     size >>= 8;
     if (size & 0xff) { out[pos++] = size; i |= 0x20; }

     out[*outpos] = i;
     *outpos = pos;
}



void encode_size(char* out, int* outpos, unsigned long size) {
  int pos = *outpos;
  out[pos] = size;
  size >>= 7;
  while (size) {
    out[pos++] |= 0x80;
    out[pos] = size;
    size >>= 7;
  }
  *outpos = ++pos;
}


void *diff_delta(void *from_buf, unsigned long from_size,
		 void *to_buf, unsigned long to_size,
		 unsigned long *delta_size,
		 unsigned long max_size) {
  int index;
  int l;
  char* base = from_buf;
  char* version = to_buf;
  unsigned long base_size = from_size;
  unsigned long version_size = to_size;

  char* base_copy = base;
  char* version_copy = version;
  struct entry* table = calloc(HASH_TABLE_SIZE, sizeof(struct entry));
  //int delta_alloc = DELTA_SIZE;
  char* delta = malloc(DELTA_SIZE);
  int deltapos = 0;
  char* base_top = base + base_size;
  char* version_top = version + version_size;

  encode_size(delta, &deltapos, base_size);
  encode_size(delta, &deltapos, version_size);

  char* base_offset = base;
  char* version_offset = version;
  unsigned int base_hash = init_hash(base);
  unsigned int version_hash = init_hash(version);
  char* version_start = version;

  while(base_offset + PREFIX_SIZE < base_top && 
	version_offset  + PREFIX_SIZE < version_top) {  
    // step2:
    
    index = HASH(base_hash);
    switch (table[index].file) {
    case '\0': {
      table[index].file = 'b';
      table[index].offset = base_offset;
      break;
    }
    case 'v': {
      if (same_prefixes(base_offset, table[index].offset)) {
	base_copy = base_offset;
	version_copy = table[index].offset;
	goto step3;
      } else break;
    }
    case 'b': break;
    default: printf("AAAAAARGH 2b\n");
    }
    
    index = HASH(version_hash);
    switch (table[index].file) {
    case '\0': {
      table[index].file = 'v';
      table[index].offset = version_offset;
      break;
    }
    case 'b': {
      if (same_prefixes(table[index].offset, version_offset)) {
	base_copy = table[index].offset;
	version_copy = version_offset;
	goto step3;
      } else break;
    }
    case 'v': break;
    default: printf("AAAAAARGH 2v\n");
    }
    
    base_offset++;
    version_offset++;

    base_hash = hash(base_offset, base_hash);
    version_hash = hash(version_offset, version_hash);
    continue;  //  goto step2;
    
  step3:
    l = 0;
    while(base_copy[l] == version_copy[l]) l++;
    base_offset = base_copy + l;
    version_offset = version_copy + l;
    
    /*
    // Make sure we don't run out of delta buffer when encoding.
    if((delta_alloc - deltapos) < 
       (version_start - version_copy) + 1 + 8 + (PREFIX_SIZE + 1)) {
      delta_alloc = delta_alloc * 3 / 2;
      delta = (char*) realloc(delta, delta_alloc);
    }
    */
	if(max_size && deltapos > max_size) {
		free(delta);
		free(table);
		return NULL;
	}

    // step4:
    encode_add(delta, &deltapos, version_start, version_copy);
    encode_copy(delta, &deltapos, base_copy - base, l);
    
    // step5:
    flush(table);
    
    version_start = version_offset;
    
    base_hash = init_hash(base_offset);
    version_hash = init_hash(version_offset);
    
  }  //  goto step2;
  
  encode_add(delta, &deltapos, version_start, version + version_size);
  *delta_size = deltapos;

  free(table);

  return delta;
}

             reply	other threads:[~2006-04-09 14:31 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-04-09 14:31 Peter Eriksen [this message]
2006-04-09 17:14 ` [RFH] Exploration of an alternative diff_delta() algorithm Nicolas Pitre
2006-04-09 17:34   ` Peter Eriksen
2006-04-09 17:45     ` Nicolas Pitre
2006-04-09 22:45       ` Peter Eriksen
2006-04-10  3:29         ` Nicolas Pitre
2006-04-09 17:40   ` Nicolas Pitre
2006-04-09 17:53     ` Peter Eriksen
2006-04-09 18:08       ` Nicolas Pitre

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060409143117.GA23908@erlang.gbar.dtu.dk \
    --to=s022018@student.dtu.dk \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.