* Re: cvs2svn and git progress
2006-08-10 7:28 ` Martin Langhoff
@ 2006-08-10 13:32 ` Jon Smirl
0 siblings, 0 replies; 4+ messages in thread
From: Jon Smirl @ 2006-08-10 13:32 UTC (permalink / raw)
To: Martin Langhoff; +Cc: git, Shawn Pearce
[-- Attachment #1: Type: text/plain, Size: 375 bytes --]
On 8/10/06, Martin Langhoff <martin.langhoff@gmail.com> wrote:
> On 8/10/06, Jon Smirl <jonsmirl@gmail.com> wrote:
> > I've finally got cvs2svn running through pass 7 now. It took me a
>
> Jon,
>
> great stuff. Is this published somewhere I can pull it from?
This is a diff relative to your git repo of cvs2svn; it is attached as delta.patch.
>
> cheers,
>
>
> martin
>
--
Jon Smirl
jonsmirl@gmail.com
[-- Attachment #2: delta.patch --]
[-- Type: text/x-patch, Size: 17974 bytes --]
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py
index c05e387..364582d 100644
--- a/cvs2svn_lib/collect_data.py
+++ b/cvs2svn_lib/collect_data.py
@@ -24,6 +24,10 @@ import os
import re
import time
import stat
+import sha
+import zlib
+import struct
+from subprocess import Popen,PIPE
from cvs2svn_lib.boolean import *
from cvs2svn_lib.set_support import *
@@ -52,7 +56,6 @@ from cvs2svn_lib.metadata_database impor
import cvs2svn_rcsparse
-
branch_tag_re = re.compile(r'''
^
((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot
@@ -70,6 +73,10 @@ # assuming that the non-standard vendor
# default branch anyway, so we don't want this to match them anyway.
vendor_revision = re.compile(r'^1\.1\.1\.\d+$')
+# Used to parse revision deltas which either add or delete text
+# format is add/delete start range
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
def is_trunk_revision(rev):
"""Return True iff REV is a trunk revision."""
@@ -413,6 +420,14 @@ class _FileDataCollector(cvs2svn_rcspars
pass
+ def set_head_revision(self, revision):
+ """This is a callback method declared in Sink."""
+
+ self.git_head = revision
+ self.git_branches = {}
+ self.git_next = {}
+ self.git_text = {}
+
def define_revision(self, revision, timestamp, author, state,
branches, next):
"""This is a callback method declared in Sink."""
@@ -460,6 +475,11 @@ class _FileDataCollector(cvs2svn_rcspars
self._primary_dependencies.append( (next, revision,) )
else:
self._primary_dependencies.append( (revision, next,) )
+
+ # record the CVS diff structure, git code will build the revisions
+ self.git_branches[revision] = branches
+ self.git_next[revision] = next
+
def _resolve_dependencies(self):
"""Store the primary and branch dependencies into the rev_data objects."""
@@ -476,7 +496,14 @@ class _FileDataCollector(cvs2svn_rcspars
for branch_data in self.sdc.branches_data.values():
# The branch_data's parent has the branch as a child regardless
# of whether the branch had any subsequent commits:
- parent_data = self._rev_data[branch_data.parent]
+ try:
+ parent_data = self._rev_data[branch_data.parent]
+ except KeyError:
+ sys.stderr.write("%s: in '%s':\n"
+ " Missing revision %s\n"
+ % (warning_prefix,
+ self.cvs_file.filename, branch_data.parent))
+ continue
parent_data.branches_data.append(branch_data)
if not Ctx().trunk_only and parent_data.child is not None:
@@ -497,7 +524,14 @@ class _FileDataCollector(cvs2svn_rcspars
for tag_data_list in self.sdc.tags_data.values():
for tag_data in tag_data_list:
# The tag_data's rev has the tag as a child:
- parent_data = self._rev_data[tag_data.rev]
+ try:
+ parent_data = self._rev_data[tag_data.rev]
+ except KeyError:
+ sys.stderr.write("%s: in '%s':\n"
+ " Missing revision %s\n"
+ % (warning_prefix,
+ self.cvs_file.filename, tag_data.rev))
+ continue
parent_data.tags_data.append(tag_data)
if not Ctx().trunk_only and parent_data.child is not None:
@@ -708,12 +742,14 @@ class _FileDataCollector(cvs2svn_rcspars
self._get_rev_id(rev_data.child),
self._determine_operation(rev_data),
revision,
- bool(text),
+ bool(text), "",
lod,
rev_data.is_first_on_branch(),
tag_ids, branch_ids, closed_symbol_ids)
rev_data.c_rev = c_rev
- self.collect_data.add_cvs_revision(c_rev)
+
+ # record the CVS tree so that git can build the revisions
+ self.git_text[revision] = text
def parse_completed(self):
"""Walk through all branches and tags and register them with their
@@ -725,6 +761,66 @@ class _FileDataCollector(cvs2svn_rcspars
self.sdc.register_branch_blockers()
+ def git_write_file(revision, text):
+ header = 'blob ' + str(len(text)) + '\0'
+ sha1 = sha.new(header)
+ sha1.update(text)
+ digest = sha1.digest()
+ rev_data = self._rev_data[revision]
+ rev_data.c_rev.sha = sha1.hexdigest()
+ self.collect_data.add_cvs_revision(rev_data.c_rev)
+ print rev_data.c_rev.id, rev_data.c_rev.sha
+
+ if digest not in self.collect_data.object_names:
+ self.collect_data.object_names[digest] = ""
+ self.collect_data.fimport.stdin.write(struct.pack("l",len(text)))
+ self.collect_data.fimport.stdin.write(text)
+
+ def git_process_diffs(lines, deltas):
+ stack = []
+ delta = 0
+ while delta < len(deltas):
+ ops = deltaPattern.search(deltas[delta]).groups()
+
+ delta += 1
+ x = int(ops[1]) - 1
+ y = int(ops[2])
+ stack.append([ops[0], x, y, delta])
+
+ if ops[0] == 'a':
+ delta += y
+
+ while stack != []:
+ ops = stack.pop()
+ if ops[0] == 'd':
+ lines[ops[1] : ops[1] + ops[2]] = {}
+ elif ops[0] == 'a':
+ lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+ def git_process_revs(lines, revision):
+
+ while revision:
+ deltas = self.git_text[revision].split('\n')
+ deltas.pop()
+ if len(deltas) > 0:
+ git_process_diffs(lines, deltas)
+
+ git_write_file(revision, ''.join(lines))
+
+ if len(self.git_branches[revision]):
+ for branch in self.git_branches[revision]:
+ git_process_revs(lines[:], branch)
+
+ revision = self.git_next[revision]
+
+ revision = self.git_head
+
+ git_write_file(revision, self.git_text[revision])
+
+ lines = self.git_text[revision].splitlines(True)
+ revision = self.git_next[revision]
+ if revision:
+ git_process_revs(lines, revision)
ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')
@@ -746,8 +842,14 @@ class _ProjectDataCollector:
self.found_valid_file = False
self.fatal_errors = []
self.num_files = 0
+ collect_data.fimport = Popen(['git-fast-import', 'testme', '760000'], stdin = PIPE)
+ collect_data.object_names = {}
+
os.path.walk(self.project.project_cvs_repos_path,
_ProjectDataCollector._visit_directory, self)
+ collect_data.fimport.stdin.close()
+ collect_data.object_names = {}
+
if not self.fatal_errors and not self.found_valid_file:
self.fatal_errors.append(
'\n'
@@ -783,6 +885,7 @@ class _ProjectDataCollector:
self.num_files += 1
def _visit_directory(self, dirname, files):
+
for fname in files:
verify_filename_legal(fname)
if not fname.endswith(',v'):
@@ -793,7 +896,6 @@ class _ProjectDataCollector:
self._process_file(pathname)
-
class CollectData:
"""Repository for data collected by parsing the CVS repository files.
@@ -814,6 +916,7 @@ class CollectData:
self.num_files = 0
self.symbol_stats = SymbolStatisticsCollector()
self.stats_keeper = stats_keeper
+ self.object_names = []
# Key generator to generate unique keys for each CVSRevision object:
self.key_generator = KeyGenerator()
@@ -836,5 +939,3 @@ class CollectData:
def write_symbol_stats(self):
self.symbol_stats.write()
-
-
diff --git a/cvs2svn_lib/cvs_item.py b/cvs2svn_lib/cvs_item.py
index beabd7c..995fb76 100644
--- a/cvs2svn_lib/cvs_item.py
+++ b/cvs2svn_lib/cvs_item.py
@@ -46,7 +46,7 @@ class CVSRevision(CVSItem):
id, cvs_file,
timestamp, metadata_id,
prev_id, next_id,
- op, rev, deltatext_exists,
+ op, rev, deltatext_exists, sha,
lod, first_on_branch,
tag_ids, branch_ids, closed_symbol_ids):
"""Initialize a new CVSRevision object.
@@ -61,6 +61,7 @@ class CVSRevision(CVSItem):
OP --> (char) OP_ADD, OP_CHANGE, or OP_DELETE
REV --> (string) this CVS rev, e.g., '1.3'
DELTATEXT_EXISTS--> (bool) true iff non-empty deltatext
+ SHA --> sha1 of git revision
LOD --> (LineOfDevelopment) LOD where this rev occurred
FIRST_ON_BRANCH --> (bool) true iff the first rev on its branch
TAG_IDS --> (list of int) ids of all tags on this revision
@@ -79,6 +80,7 @@ class CVSRevision(CVSItem):
self.prev_id = prev_id
self.next_id = next_id
self.deltatext_exists = deltatext_exists
+ self.sha = sha
self.lod = lod
self.first_on_branch = first_on_branch
self.tag_ids = tag_ids
@@ -113,6 +115,7 @@ class CVSRevision(CVSItem):
self.op,
self.rev,
self.deltatext_exists,
+ self.sha,
lod_id,
self.first_on_branch,
' '.join(['%x' % id for id in self.tag_ids]),
@@ -122,7 +125,7 @@ class CVSRevision(CVSItem):
def __setstate__(self, data):
(self.id, cvs_file_id, self.timestamp, self.metadata_id,
self.prev_id, self.next_id, self.op, self.rev,
- self.deltatext_exists, lod_id, self.first_on_branch,
+ self.deltatext_exists, self.sha, lod_id, self.first_on_branch,
tag_ids, branch_ids, closed_symbol_ids) = data
self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id)
if lod_id is None:
diff --git a/cvs2svn_lib/dumpfile_delegate.py b/cvs2svn_lib/dumpfile_delegate.py
index fb0606f..03a06d4 100644
--- a/cvs2svn_lib/dumpfile_delegate.py
+++ b/cvs2svn_lib/dumpfile_delegate.py
@@ -232,67 +232,20 @@ class DumpfileDelegate(SVNRepositoryMirr
# If the file has keywords, we must prevent CVS/RCS from expanding
# the keywords because they must be unexpanded in the repository,
# or Subversion will get confused.
- pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
- c_rev, suppress_keyword_substitution=s_item.has_keywords)
+ #pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
+ # c_rev, suppress_keyword_substitution=s_item.has_keywords)
self.dumpfile.write('Node-path: %s\n'
'Node-kind: file\n'
'Node-action: %s\n'
'%s' # no property header if no props
- 'Text-content-length: '
% (self._utf8_path(c_rev.svn_path),
action, props_header))
- pos = self.dumpfile.tell()
-
- self.dumpfile.write('0000000000000000\n'
- 'Text-content-md5: 00000000000000000000000000000000\n'
- 'Content-length: 0000000000000000\n'
- '\n')
-
if prop_contents:
self.dumpfile.write(prop_contents)
- # Insert a filter to convert all EOLs to LFs if neccessary
- if s_item.needs_eol_filter:
- data_reader = LF_EOL_Filter(pipe.stdout)
- else:
- data_reader = pipe.stdout
-
- # Insert the rev contents, calculating length and checksum as we go.
- checksum = md5.new()
- length = 0
- while True:
- buf = data_reader.read(config.PIPE_READ_SIZE)
- if buf == '':
- break
- checksum.update(buf)
- length += len(buf)
- self.dumpfile.write(buf)
-
- pipe.stdout.close()
- error_output = pipe.stderr.read()
- exit_status = pipe.wait()
- if exit_status:
- raise FatalError("The command '%s' failed with exit status: %s\n"
- "and the following output:\n"
- "%s" % (pipe_cmd, exit_status, error_output))
-
- # Go back to patch up the length and checksum headers:
- self.dumpfile.seek(pos, 0)
- # We left 16 zeros for the text length; replace them with the real
- # length, padded on the left with spaces:
- self.dumpfile.write('%16d' % length)
- # 16... + 1 newline + len('Text-content-md5: ') == 35
- self.dumpfile.seek(pos + 35, 0)
- self.dumpfile.write(checksum.hexdigest())
- # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
- self.dumpfile.seek(pos + 84, 0)
- # The content length is the length of property data, text data,
- # and any metadata around/inside around them.
- self.dumpfile.write('%16d' % (length + len(prop_contents)))
- # Jump back to the end of the stream
- self.dumpfile.seek(0, 2)
+ self.dumpfile.write('Git-sha1: %s\n' % (c_rev.sha))
# This record is done (write two newlines -- one to terminate
# contents that weren't themselves newline-termination, one to
diff --git a/cvs2svn_rcsparse/debug.py b/cvs2svn_rcsparse/debug.py
index cfeaf2b..c2d143d 100644
--- a/cvs2svn_rcsparse/debug.py
+++ b/cvs2svn_rcsparse/debug.py
@@ -1,22 +1,113 @@
-# -*-python-*-
#
-# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
+# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
-# the LICENSE.html file which can be found at the top level of the ViewVC
-# distribution or at http://viewvc.org/license-1.html.
+# the LICENSE.html file which can be found at the top level of the ViewCVS
+# distribution or at http://viewcvs.sourceforge.net/license-1.html.
#
-# For more information, visit http://viewvc.org/
+# Contact information:
+# Greg Stein, PO Box 760, Palo Alto, CA, 94302
+# gstein@lyra.org, http://viewcvs.sourceforge.net/
+#
+# -----------------------------------------------------------------------
+#
+# This software is being maintained as part of the ViewCVS project.
+# Information is available at:
+# http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------
"""debug.py: various debugging tools for the rcsparse package."""
import time
+import re
+import sha
+import zlib
+import os
+import struct
+from subprocess import Popen,PIPE
from __init__ import parse
import common
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
+class RevisionsSink(common.Sink):
+
+ def __init__(self, fimport):
+ self.fimport = fimport
+
+ def set_head_revision(self, revision):
+ self.git_head = revision
+ self.git_branches = {}
+ self.git_next = {}
+ self.git_text = {}
+
+ def define_revision(self, revision, timestamp, author, state,
+ branches, next):
+ self.git_branches[revision] = branches
+ self.git_next[revision] = next
+
+ def set_revision_info(self, revision, log, text):
+ self.git_text[revision] = text
+
+ def parse_completed(self):
+
+ def write_file(text):
+ header = 'blob ' + str(len(text)) + '\0'
+ sha1 = sha.new(header)
+ sha1.update(text)
+ name = sha1.hexdigest()
+
+ print 'length is ', len(text)
+ self.fimport.stdin.write(struct.pack("l",len(text)))
+ self.fimport.stdin.write(text)
+
+ def process_diffs(lines, deltas):
+ stack = []
+ delta = 0
+ while delta < len(deltas):
+ ops = deltaPattern.search(deltas[delta]).groups()
+
+ delta += 1
+ x = int(ops[1]) - 1
+ y = int(ops[2])
+ stack.append([ops[0], x, y, delta])
+
+ if ops[0] == 'a':
+ delta += y
+
+ while stack != []:
+ ops = stack.pop()
+ if ops[0] == 'd':
+ lines[ops[1] : ops[1] + ops[2]] = {}
+ elif ops[0] == 'a':
+ lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+ def process_revs(lines, revision):
+
+ while revision:
+ deltas = self.git_text[revision].split('\n')
+ deltas.pop()
+ if len(deltas) > 0:
+ process_diffs(lines, deltas)
+
+ write_file(''.join(lines))
+
+ if len(self.git_branches[revision]):
+ for branch in self.git_branches[revision]:
+ process_revs(lines[:], branch)
+
+ revision = self.git_next[revision]
+
+ revision = self.git_head
+
+ write_file(self.git_text[revision])
+
+ lines = self.git_text[revision].splitlines(True)
+ revision = self.git_next[revision]
+ if revision:
+ process_revs(lines, revision)
class DebugSink(common.Sink):
def set_head_revision(self, revision):
@@ -46,7 +137,7 @@ class DebugSink(common.Sink):
def set_revision_info(self, revision, log, text):
print 'revision:', revision
print ' log:', log
- print ' text:', text[:100], '...'
+ print ' text:', text
class DumpSink(common.Sink):
@@ -90,9 +181,17 @@ class DumpSink(common.Sink):
print 'parse_completed'
+def debug_file(fname):
+ parse(open(fname, 'rb'), DebugSink())
+
def dump_file(fname):
parse(open(fname, 'rb'), DumpSink())
+def revisions_file(fname):
+ fimport = Popen(['git-fast-import', 'testme'], stdin = PIPE)
+ parse(open(fname, 'rb'), RevisionsSink(fimport))
+ fimport.stdin.close()
+
def time_file(fname):
f = open(fname, 'rb')
s = common.Sink()
@@ -116,7 +215,11 @@ if __name__ == '__main__':
_usage()
if sys.argv[1] == 'dump':
dump_file(sys.argv[2])
+ elif sys.argv[1] == 'debug':
+ debug_file(sys.argv[2])
elif sys.argv[1] == 'time':
time_file(sys.argv[2])
+ elif sys.argv[1] == 'revisions':
+ revisions_file(sys.argv[2])
else:
_usage()
diff --git a/cvs2svn_rcsparse/default.py b/cvs2svn_rcsparse/default.py
index 14c9958..4e108f4 100644
--- a/cvs2svn_rcsparse/default.py
+++ b/cvs2svn_rcsparse/default.py
@@ -24,7 +24,7 @@ class _TokenStream:
# the algorithm is about the same speed for any CHUNK_SIZE chosen.
# grab a good-sized chunk, but not too large to overwhelm memory.
# note: we use a multiple of a standard block size
- CHUNK_SIZE = 192 * 512 # about 100k
+ CHUNK_SIZE = 4096 * 512 # about 2MB
# CHUNK_SIZE = 5 # for debugging, make the function grind...
^ permalink raw reply related [flat|nested] 4+ messages in thread