* cvs2svn and git progress
@ 2006-08-10 1:29 Jon Smirl
2006-08-10 7:28 ` Martin Langhoff
2006-08-10 20:36 ` Jakub Narebski
0 siblings, 2 replies; 4+ messages in thread
From: Jon Smirl @ 2006-08-10 1:29 UTC (permalink / raw)
To: git, Shawn Pearce
I've finally got cvs2svn running through pass 7 now. It took me a
while to fix errors where it would run five hours and then die with
"key error 357" or something similar.
pass 1: 7663 seconds
pass 2: 1 second
pass 3: 1043 seconds
pass 4: 3 seconds
pass 5: 204 seconds
pass 6: 1165 seconds
pass 7: 37 seconds
Total is a little less than three hours. The new cvs2svn code is many
times faster than the old version which ran for several days. The
total includes the time needed to write the git pack and index file
for the revisions.
Pass 8 is where the final output is generated. Now that I have a
database, I can re-run just pass 8 as needed to work on the back-end code.
I believe I have the sha1 from the git revisions correctly in the
final database but I need to write my first try at dumping the change
sets to be sure.
--
Jon Smirl
jonsmirl@gmail.com
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: cvs2svn and git progress
2006-08-10 1:29 cvs2svn and git progress Jon Smirl
@ 2006-08-10 7:28 ` Martin Langhoff
2006-08-10 13:32 ` Jon Smirl
2006-08-10 20:36 ` Jakub Narebski
1 sibling, 1 reply; 4+ messages in thread
From: Martin Langhoff @ 2006-08-10 7:28 UTC (permalink / raw)
To: Jon Smirl; +Cc: git, Shawn Pearce
On 8/10/06, Jon Smirl <jonsmirl@gmail.com> wrote:
> I've finally got cvs2svn running through pass 7 now. It took me a
Jon,
great stuff. Is this published somewhere I can pull it from?
cheers,
martin
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: cvs2svn and git progress
2006-08-10 7:28 ` Martin Langhoff
@ 2006-08-10 13:32 ` Jon Smirl
0 siblings, 0 replies; 4+ messages in thread
From: Jon Smirl @ 2006-08-10 13:32 UTC (permalink / raw)
To: Martin Langhoff; +Cc: git, Shawn Pearce
[-- Attachment #1: Type: text/plain, Size: 375 bytes --]
On 8/10/06, Martin Langhoff <martin.langhoff@gmail.com> wrote:
> On 8/10/06, Jon Smirl <jonsmirl@gmail.com> wrote:
> > I've finally got cvs2svn running through pass 7 now. It took me a
>
> Jon,
>
> great stuff. Is this published somewhere I can pull it from?
This is a diff relative to your git repo of cvs2svn.
>
> cheers,
>
>
> martin
>
--
Jon Smirl
jonsmirl@gmail.com
[-- Attachment #2: delta.patch --]
[-- Type: text/x-patch, Size: 17974 bytes --]
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py
index c05e387..364582d 100644
--- a/cvs2svn_lib/collect_data.py
+++ b/cvs2svn_lib/collect_data.py
@@ -24,6 +24,10 @@ import os
import re
import time
import stat
+import sha
+import zlib
+import struct
+from subprocess import Popen,PIPE
from cvs2svn_lib.boolean import *
from cvs2svn_lib.set_support import *
@@ -52,7 +56,6 @@ from cvs2svn_lib.metadata_database impor
import cvs2svn_rcsparse
-
branch_tag_re = re.compile(r'''
^
((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot
@@ -70,6 +73,10 @@ # assuming that the non-standard vendor
# default branch anyway, so we don't want this to match them anyway.
vendor_revision = re.compile(r'^1\.1\.1\.\d+$')
+# Used to parse revision deltas which either add or delete text
+# format is add/delete start range
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
def is_trunk_revision(rev):
"""Return True iff REV is a trunk revision."""
@@ -413,6 +420,14 @@ class _FileDataCollector(cvs2svn_rcspars
pass
+ def set_head_revision(self, revision):
+ """This is a callback method declared in Sink."""
+
+ self.git_head = revision
+ self.git_branches = {}
+ self.git_next = {}
+ self.git_text = {}
+
def define_revision(self, revision, timestamp, author, state,
branches, next):
"""This is a callback method declared in Sink."""
@@ -460,6 +475,11 @@ class _FileDataCollector(cvs2svn_rcspars
self._primary_dependencies.append( (next, revision,) )
else:
self._primary_dependencies.append( (revision, next,) )
+
+ # record the CVS diff structure, git code will build the revisions
+ self.git_branches[revision] = branches
+ self.git_next[revision] = next
+
def _resolve_dependencies(self):
"""Store the primary and branch dependencies into the rev_data objects."""
@@ -476,7 +496,14 @@ class _FileDataCollector(cvs2svn_rcspars
for branch_data in self.sdc.branches_data.values():
# The branch_data's parent has the branch as a child regardless
# of whether the branch had any subsequent commits:
- parent_data = self._rev_data[branch_data.parent]
+ try:
+ parent_data = self._rev_data[branch_data.parent]
+ except KeyError:
+ sys.stderr.write("%s: in '%s':\n"
+ " Missing revision %s\n"
+ % (warning_prefix,
+ self.cvs_file.filename, branch_data.parent))
+ continue
parent_data.branches_data.append(branch_data)
if not Ctx().trunk_only and parent_data.child is not None:
@@ -497,7 +524,14 @@ class _FileDataCollector(cvs2svn_rcspars
for tag_data_list in self.sdc.tags_data.values():
for tag_data in tag_data_list:
# The tag_data's rev has the tag as a child:
- parent_data = self._rev_data[tag_data.rev]
+ try:
+ parent_data = self._rev_data[tag_data.rev]
+ except KeyError:
+ sys.stderr.write("%s: in '%s':\n"
+ " Missing revision %s\n"
+ % (warning_prefix,
+ self.cvs_file.filename, tag_data.rev))
+ continue
parent_data.tags_data.append(tag_data)
if not Ctx().trunk_only and parent_data.child is not None:
@@ -708,12 +742,14 @@ class _FileDataCollector(cvs2svn_rcspars
self._get_rev_id(rev_data.child),
self._determine_operation(rev_data),
revision,
- bool(text),
+ bool(text), "",
lod,
rev_data.is_first_on_branch(),
tag_ids, branch_ids, closed_symbol_ids)
rev_data.c_rev = c_rev
- self.collect_data.add_cvs_revision(c_rev)
+
+ # record the CVS tree so that git can build the revisions
+ self.git_text[revision] = text
def parse_completed(self):
"""Walk through all branches and tags and register them with their
@@ -725,6 +761,66 @@ class _FileDataCollector(cvs2svn_rcspars
self.sdc.register_branch_blockers()
+ def git_write_file(revision, text):
+ header = 'blob ' + str(len(text)) + '\0'
+ sha1 = sha.new(header)
+ sha1.update(text)
+ digest = sha1.digest()
+ rev_data = self._rev_data[revision]
+ rev_data.c_rev.sha = sha1.hexdigest()
+ self.collect_data.add_cvs_revision(rev_data.c_rev)
+ print rev_data.c_rev.id, rev_data.c_rev.sha
+
+ if digest not in self.collect_data.object_names:
+ self.collect_data.object_names[digest] = ""
+ self.collect_data.fimport.stdin.write(struct.pack("l",len(text)))
+ self.collect_data.fimport.stdin.write(text)
+
+ def git_process_diffs(lines, deltas):
+ stack = []
+ delta = 0
+ while delta < len(deltas):
+ ops = deltaPattern.search(deltas[delta]).groups()
+
+ delta += 1
+ x = int(ops[1]) - 1
+ y = int(ops[2])
+ stack.append([ops[0], x, y, delta])
+
+ if ops[0] == 'a':
+ delta += y
+
+ while stack != []:
+ ops = stack.pop()
+ if ops[0] == 'd':
+ lines[ops[1] : ops[1] + ops[2]] = {}
+ elif ops[0] == 'a':
+ lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+ def git_process_revs(lines, revision):
+
+ while revision:
+ deltas = self.git_text[revision].split('\n')
+ deltas.pop()
+ if len(deltas) > 0:
+ git_process_diffs(lines, deltas)
+
+ git_write_file(revision, ''.join(lines))
+
+ if len(self.git_branches[revision]):
+ for branch in self.git_branches[revision]:
+ git_process_revs(lines[:], branch)
+
+ revision = self.git_next[revision]
+
+ revision = self.git_head
+
+ git_write_file(revision, self.git_text[revision])
+
+ lines = self.git_text[revision].splitlines(True)
+ revision = self.git_next[revision]
+ if revision:
+ git_process_revs(lines, revision)
ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')
@@ -746,8 +842,14 @@ class _ProjectDataCollector:
self.found_valid_file = False
self.fatal_errors = []
self.num_files = 0
+ collect_data.fimport = Popen(['git-fast-import', 'testme', '760000'], stdin = PIPE)
+ collect_data.object_names = {}
+
os.path.walk(self.project.project_cvs_repos_path,
_ProjectDataCollector._visit_directory, self)
+ collect_data.fimport.stdin.close()
+ collect_data.object_names = {}
+
if not self.fatal_errors and not self.found_valid_file:
self.fatal_errors.append(
'\n'
@@ -783,6 +885,7 @@ class _ProjectDataCollector:
self.num_files += 1
def _visit_directory(self, dirname, files):
+
for fname in files:
verify_filename_legal(fname)
if not fname.endswith(',v'):
@@ -793,7 +896,6 @@ class _ProjectDataCollector:
self._process_file(pathname)
-
class CollectData:
"""Repository for data collected by parsing the CVS repository files.
@@ -814,6 +916,7 @@ class CollectData:
self.num_files = 0
self.symbol_stats = SymbolStatisticsCollector()
self.stats_keeper = stats_keeper
+ self.object_names = []
# Key generator to generate unique keys for each CVSRevision object:
self.key_generator = KeyGenerator()
@@ -836,5 +939,3 @@ class CollectData:
def write_symbol_stats(self):
self.symbol_stats.write()
-
-
diff --git a/cvs2svn_lib/cvs_item.py b/cvs2svn_lib/cvs_item.py
index beabd7c..995fb76 100644
--- a/cvs2svn_lib/cvs_item.py
+++ b/cvs2svn_lib/cvs_item.py
@@ -46,7 +46,7 @@ class CVSRevision(CVSItem):
id, cvs_file,
timestamp, metadata_id,
prev_id, next_id,
- op, rev, deltatext_exists,
+ op, rev, deltatext_exists, sha,
lod, first_on_branch,
tag_ids, branch_ids, closed_symbol_ids):
"""Initialize a new CVSRevision object.
@@ -61,6 +61,7 @@ class CVSRevision(CVSItem):
OP --> (char) OP_ADD, OP_CHANGE, or OP_DELETE
REV --> (string) this CVS rev, e.g., '1.3'
DELTATEXT_EXISTS--> (bool) true iff non-empty deltatext
+ SHA --> sha1 of git revision
LOD --> (LineOfDevelopment) LOD where this rev occurred
FIRST_ON_BRANCH --> (bool) true iff the first rev on its branch
TAG_IDS --> (list of int) ids of all tags on this revision
@@ -79,6 +80,7 @@ class CVSRevision(CVSItem):
self.prev_id = prev_id
self.next_id = next_id
self.deltatext_exists = deltatext_exists
+ self.sha = sha
self.lod = lod
self.first_on_branch = first_on_branch
self.tag_ids = tag_ids
@@ -113,6 +115,7 @@ class CVSRevision(CVSItem):
self.op,
self.rev,
self.deltatext_exists,
+ self.sha,
lod_id,
self.first_on_branch,
' '.join(['%x' % id for id in self.tag_ids]),
@@ -122,7 +125,7 @@ class CVSRevision(CVSItem):
def __setstate__(self, data):
(self.id, cvs_file_id, self.timestamp, self.metadata_id,
self.prev_id, self.next_id, self.op, self.rev,
- self.deltatext_exists, lod_id, self.first_on_branch,
+ self.deltatext_exists, self.sha, lod_id, self.first_on_branch,
tag_ids, branch_ids, closed_symbol_ids) = data
self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id)
if lod_id is None:
diff --git a/cvs2svn_lib/dumpfile_delegate.py b/cvs2svn_lib/dumpfile_delegate.py
index fb0606f..03a06d4 100644
--- a/cvs2svn_lib/dumpfile_delegate.py
+++ b/cvs2svn_lib/dumpfile_delegate.py
@@ -232,67 +232,20 @@ class DumpfileDelegate(SVNRepositoryMirr
# If the file has keywords, we must prevent CVS/RCS from expanding
# the keywords because they must be unexpanded in the repository,
# or Subversion will get confused.
- pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
- c_rev, suppress_keyword_substitution=s_item.has_keywords)
+ #pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
+ # c_rev, suppress_keyword_substitution=s_item.has_keywords)
self.dumpfile.write('Node-path: %s\n'
'Node-kind: file\n'
'Node-action: %s\n'
'%s' # no property header if no props
- 'Text-content-length: '
% (self._utf8_path(c_rev.svn_path),
action, props_header))
- pos = self.dumpfile.tell()
-
- self.dumpfile.write('0000000000000000\n'
- 'Text-content-md5: 00000000000000000000000000000000\n'
- 'Content-length: 0000000000000000\n'
- '\n')
-
if prop_contents:
self.dumpfile.write(prop_contents)
- # Insert a filter to convert all EOLs to LFs if neccessary
- if s_item.needs_eol_filter:
- data_reader = LF_EOL_Filter(pipe.stdout)
- else:
- data_reader = pipe.stdout
-
- # Insert the rev contents, calculating length and checksum as we go.
- checksum = md5.new()
- length = 0
- while True:
- buf = data_reader.read(config.PIPE_READ_SIZE)
- if buf == '':
- break
- checksum.update(buf)
- length += len(buf)
- self.dumpfile.write(buf)
-
- pipe.stdout.close()
- error_output = pipe.stderr.read()
- exit_status = pipe.wait()
- if exit_status:
- raise FatalError("The command '%s' failed with exit status: %s\n"
- "and the following output:\n"
- "%s" % (pipe_cmd, exit_status, error_output))
-
- # Go back to patch up the length and checksum headers:
- self.dumpfile.seek(pos, 0)
- # We left 16 zeros for the text length; replace them with the real
- # length, padded on the left with spaces:
- self.dumpfile.write('%16d' % length)
- # 16... + 1 newline + len('Text-content-md5: ') == 35
- self.dumpfile.seek(pos + 35, 0)
- self.dumpfile.write(checksum.hexdigest())
- # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
- self.dumpfile.seek(pos + 84, 0)
- # The content length is the length of property data, text data,
- # and any metadata around/inside around them.
- self.dumpfile.write('%16d' % (length + len(prop_contents)))
- # Jump back to the end of the stream
- self.dumpfile.seek(0, 2)
+ self.dumpfile.write('Git-sha1: %s\n' % (c_rev.sha))
# This record is done (write two newlines -- one to terminate
# contents that weren't themselves newline-termination, one to
diff --git a/cvs2svn_rcsparse/debug.py b/cvs2svn_rcsparse/debug.py
index cfeaf2b..c2d143d 100644
--- a/cvs2svn_rcsparse/debug.py
+++ b/cvs2svn_rcsparse/debug.py
@@ -1,22 +1,113 @@
-# -*-python-*-
#
-# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
+# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
-# the LICENSE.html file which can be found at the top level of the ViewVC
-# distribution or at http://viewvc.org/license-1.html.
+# the LICENSE.html file which can be found at the top level of the ViewCVS
+# distribution or at http://viewcvs.sourceforge.net/license-1.html.
#
-# For more information, visit http://viewvc.org/
+# Contact information:
+# Greg Stein, PO Box 760, Palo Alto, CA, 94302
+# gstein@lyra.org, http://viewcvs.sourceforge.net/
+#
+# -----------------------------------------------------------------------
+#
+# This software is being maintained as part of the ViewCVS project.
+# Information is available at:
+# http://viewcvs.sourceforge.net/
#
# -----------------------------------------------------------------------
"""debug.py: various debugging tools for the rcsparse package."""
import time
+import re
+import sha
+import zlib
+import os
+import struct
+from subprocess import Popen,PIPE
from __init__ import parse
import common
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
+class RevisionsSink(common.Sink):
+
+ def __init__(self, fimport):
+ self.fimport = fimport
+
+ def set_head_revision(self, revision):
+ self.git_head = revision
+ self.git_branches = {}
+ self.git_next = {}
+ self.git_text = {}
+
+ def define_revision(self, revision, timestamp, author, state,
+ branches, next):
+ self.git_branches[revision] = branches
+ self.git_next[revision] = next
+
+ def set_revision_info(self, revision, log, text):
+ self.git_text[revision] = text
+
+ def parse_completed(self):
+
+ def write_file(text):
+ header = 'blob ' + str(len(text)) + '\0'
+ sha1 = sha.new(header)
+ sha1.update(text)
+ name = sha1.hexdigest()
+
+ print 'length is ', len(text)
+ self.fimport.stdin.write(struct.pack("l",len(text)))
+ self.fimport.stdin.write(text)
+
+ def process_diffs(lines, deltas):
+ stack = []
+ delta = 0
+ while delta < len(deltas):
+ ops = deltaPattern.search(deltas[delta]).groups()
+
+ delta += 1
+ x = int(ops[1]) - 1
+ y = int(ops[2])
+ stack.append([ops[0], x, y, delta])
+
+ if ops[0] == 'a':
+ delta += y
+
+ while stack != []:
+ ops = stack.pop()
+ if ops[0] == 'd':
+ lines[ops[1] : ops[1] + ops[2]] = {}
+ elif ops[0] == 'a':
+ lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+ def process_revs(lines, revision):
+
+ while revision:
+ deltas = self.git_text[revision].split('\n')
+ deltas.pop()
+ if len(deltas) > 0:
+ process_diffs(lines, deltas)
+
+ write_file(''.join(lines))
+
+ if len(self.git_branches[revision]):
+ for branch in self.git_branches[revision]:
+ process_revs(lines[:], branch)
+
+ revision = self.git_next[revision]
+
+ revision = self.git_head
+
+ write_file(self.git_text[revision])
+
+ lines = self.git_text[revision].splitlines(True)
+ revision = self.git_next[revision]
+ if revision:
+ process_revs(lines, revision)
class DebugSink(common.Sink):
def set_head_revision(self, revision):
@@ -46,7 +137,7 @@ class DebugSink(common.Sink):
def set_revision_info(self, revision, log, text):
print 'revision:', revision
print ' log:', log
- print ' text:', text[:100], '...'
+ print ' text:', text
class DumpSink(common.Sink):
@@ -90,9 +181,17 @@ class DumpSink(common.Sink):
print 'parse_completed'
+def debug_file(fname):
+ parse(open(fname, 'rb'), DebugSink())
+
def dump_file(fname):
parse(open(fname, 'rb'), DumpSink())
+def revisions_file(fname):
+ fimport = Popen(['git-fast-import', 'testme'], stdin = PIPE)
+ parse(open(fname, 'rb'), RevisionsSink(fimport))
+ fimport.stdin.close()
+
def time_file(fname):
f = open(fname, 'rb')
s = common.Sink()
@@ -116,7 +215,11 @@ if __name__ == '__main__':
_usage()
if sys.argv[1] == 'dump':
dump_file(sys.argv[2])
+ elif sys.argv[1] == 'debug':
+ debug_file(sys.argv[2])
elif sys.argv[1] == 'time':
time_file(sys.argv[2])
+ elif sys.argv[1] == 'revisions':
+ revisions_file(sys.argv[2])
else:
_usage()
diff --git a/cvs2svn_rcsparse/default.py b/cvs2svn_rcsparse/default.py
index 14c9958..4e108f4 100644
--- a/cvs2svn_rcsparse/default.py
+++ b/cvs2svn_rcsparse/default.py
@@ -24,7 +24,7 @@ class _TokenStream:
# the algorithm is about the same speed for any CHUNK_SIZE chosen.
# grab a good-sized chunk, but not too large to overwhelm memory.
# note: we use a multiple of a standard block size
- CHUNK_SIZE = 192 * 512 # about 100k
+ CHUNK_SIZE = 4096 * 512 # about 2MB
# CHUNK_SIZE = 5 # for debugging, make the function grind...
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: cvs2svn and git progress
2006-08-10 1:29 cvs2svn and git progress Jon Smirl
2006-08-10 7:28 ` Martin Langhoff
@ 2006-08-10 20:36 ` Jakub Narebski
1 sibling, 0 replies; 4+ messages in thread
From: Jakub Narebski @ 2006-08-10 20:36 UTC (permalink / raw)
To: git
Could you please put proper info about your cvs2svn (cvs2git?) work on
GitWiki?
http://git.or.cz/gitwiki/InterfacesFrontendsAndTools
Thanks in advance
--
Jakub Narebski
Warsaw, Poland
ShadeHawk on #git
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2006-08-10 20:37 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-10 1:29 cvs2svn and git progress Jon Smirl
2006-08-10 7:28 ` Martin Langhoff
2006-08-10 13:32 ` Jon Smirl
2006-08-10 20:36 ` Jakub Narebski
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).