From: Luke Diamand <luke@diamand.org>
To: git@vger.kernel.org
Cc: Luke Diamand <luke@diamand.org>
Subject: [PATCH/RFC v2] git-p4: stream from perforce to speed up clones
Date: Sat, 11 Jul 2009 09:31:11 +0100 [thread overview]
Message-ID: <1247301071-9541-1-git-send-email-luke@diamand.org> (raw)
Change commit() to stream data from Perforce directly into fast-import
rather than reading into memory first, and then writing out. This
hugely reduces the memory requirements when cloning non-incrementally.
Signed-off-by: Luke Diamand <luke@diamand.org>
---
I've modified git-p4 so that it streams/pipes data into fast-import rather
than reading everything into memory first. The old scheme meant that
for a large repository (mine is around 2G) my PC just grinds to a
halt and never actually finishes. With this change it takes around ten
minutes.
This is a resend of a patch I sent earlier, which my MUA helpfully
managed to word-wrap.
contrib/fast-import/git-p4 | 164 ++++++++++++++++++++++++++++++++++++--------
1 files changed, 136 insertions(+), 28 deletions(-)
diff --git a/contrib/fast-import/git-p4 b/contrib/fast-import/git-p4
index 342529d..f415ad0 100755
--- a/contrib/fast-import/git-p4
+++ b/contrib/fast-import/git-p4
@@ -1008,6 +1008,141 @@ class P4Sync(Command):
return filesForCommit
+ # output one file from the P4 stream
+ # - helper for streamP4Files
+
+ def streamOneP4File(self, file, contents, branchPrefixes):
+ if verbose:
+ sys.stderr.write("%s\n" % file["depotFile"])
+
+ relPath = self.stripRepoPath(file['depotFile'], branchPrefixes)
+
+ mode = "644"
+ if isP4Exec(file["type"]):
+ mode = "755"
+ elif file["type"] == "symlink":
+ mode = "120000"
+ # p4 print on a symlink contains "target\n", so strip it off
+ last = contents.pop()
+ last = last[:-1]
+ contents.append(last)
+
+ if self.isWindows and file["type"].endswith("text"):
+ mangled = []
+ for data in contents:
+ data = data.replace("\r\n", "\n")
+ mangled.append(data)
+ contents = mangled
+
+ if file['type'] in ('text+ko', 'unicode+ko', 'binary+ko'):
+ contents = map(lambda text: re.sub(r'(?i)\$(Id|Header):[^$]*\$',r'$\1$', text), contents)
+ elif file['type'] in ('text+k', 'ktext', 'kxtext', 'unicode+k', 'binary+k'):
+ contents = map(lambda text: re.sub(r'\$(Id|Header|Author|Date|DateTime|Change|File|Revision):[^$\n]*\$',r'$\1$', text), contents)
+
+ self.gitStream.write("M %s inline %s\n" % (mode, relPath))
+
+ # total length...
+ length = 0
+ for d in contents:
+ length = length + len(d)
+
+ self.gitStream.write("data %d\n" % length)
+ for d in contents:
+ self.gitStream.write(d)
+ self.gitStream.write("\n")
+
+ def streamOneP4Deletion(self, file, branchPrefixes):
+ if verbose:
+ sys.stderr.write("delete %s\n" % file["path"])
+
+ relPath = self.stripRepoPath(file['path'], branchPrefixes)
+
+ self.gitStream.write("D %s\n" % relPath)
+
+ # Stream directly from "p4 files" into "git fast-import"
+ def streamP4Files(self, files, branchPrefixes):
+ filesForCommit = []
+ filesToRead = []
+ filesToDelete = []
+
+ for f in files:
+ includeFile = True
+ for val in self.clientSpecDirs:
+ if f['path'].startswith(val[0]):
+ if val[1] <= 0:
+ includeFile = False
+ break
+
+ if includeFile:
+ filesForCommit.append(f)
+ if f['action'] not in ('delete', 'purge'):
+ filesToRead.append(f)
+ else:
+ filesToDelete.append(f)
+
+ filedata = []
+
+ # deleted files...
+ for f in filesToDelete:
+ self.streamOneP4Deletion(f, branchPrefixes)
+
+ if len(filesToRead) > 0:
+ stdin_file = tempfile.TemporaryFile(prefix='p4-stdin', mode='w+b')
+ stdin_file.write('\n'.join(['%s#%s' % (f['path'], f['rev'])
+ for f in filesToRead]))
+ stdin_file.flush()
+ stdin_file.seek(0)
+ try:
+ p4 = subprocess.Popen('p4 -G -x - print',
+ shell=True,
+ stdin=stdin_file,
+ stdout=subprocess.PIPE);
+ except OSError,e:
+ print >> sys.stderr, "p4 print failed:", e
+
+ file = {}
+ contents = []
+ have_file_info = False
+
+ try:
+ while True:
+ marshalled = marshal.load(p4.stdout)
+
+ if marshalled.has_key('depotFile') and have_file_info:
+ # start of a new file - output the old one first
+
+ if file["type"] == "apple":
+ print "\nfile %s is a strange apple file that forks. Ignoring" % file['path']
+ continue
+
+
+ self.streamOneP4File(file,contents,branchPrefixes)
+ file = {}
+ contents = []
+ have_file_info = False
+
+ # pick up the new file information... for the
+ # 'data' field we need to append to our array
+ for k in marshalled.keys():
+ if k == 'data':
+ contents.append(marshalled['data'])
+ else:
+ file[k] = marshalled[k]
+
+ have_file_info = True
+ except EOFError:
+ pass
+
+ # do the last chunk
+
+ if file.has_key('depotFile'):
+ self.streamOneP4File(file,contents,branchPrefixes)
+
+ exitCode = p4.wait()
+ if exitCode != 0:
+ sys.stderr.write("p4 subshell failed getting file data\n")
+ sys.exit(1)
+
def commit(self, details, files, branch, branchPrefixes, parent = ""):
epoch = details["time"]
author = details["user"]
@@ -1023,7 +1158,6 @@ class P4Sync(Command):
new_files.append (f)
else:
sys.stderr.write("Ignoring file outside of prefix: %s\n" % path)
- files = self.readP4Files(new_files)
self.gitStream.write("commit %s\n" % branch)
# gitStream.write("mark :%s\n" % details["change"])
@@ -1051,33 +1185,7 @@ class P4Sync(Command):
print "parent %s" % parent
self.gitStream.write("from %s\n" % parent)
- for file in files:
- if file["type"] == "apple":
- print "\nfile %s is a strange apple file that forks. Ignoring!" % file['path']
- continue
-
- relPath = self.stripRepoPath(file['path'], branchPrefixes)
- if file["action"] in ("delete", "purge"):
- self.gitStream.write("D %s\n" % relPath)
- else:
- data = file['data']
-
- mode = "644"
- if isP4Exec(file["type"]):
- mode = "755"
- elif file["type"] == "symlink":
- mode = "120000"
- # p4 print on a symlink contains "target\n", so strip it off
- data = data[:-1]
-
- if self.isWindows and file["type"].endswith("text"):
- data = data.replace("\r\n", "\n")
-
- self.gitStream.write("M %s inline %s\n" % (mode, relPath))
- self.gitStream.write("data %s\n" % len(data))
- self.gitStream.write(data)
- self.gitStream.write("\n")
-
+ self.streamP4Files(new_files,branchPrefixes)
self.gitStream.write("\n")
change = int(details["change"])
--
1.6.3.GIT
next reply other threads:[~2009-07-11 8:31 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-07-11 8:31 Luke Diamand [this message]
2009-07-25 14:23 ` [PATCH/RFC v2] git-p4: stream from perforce to speed up clones Pete Wyckoff
2009-07-25 14:23 ` [PATCH 1/5] git-p4 stream: remove unused function Pete Wyckoff
2009-07-25 14:24 ` [PATCH 2/5] git-p4 stream: do not pass branchPrefixes so much Pete Wyckoff
2009-07-25 14:24 ` [PATCH 3/5] git-p4 stream: show relative path in debug messages Pete Wyckoff
2009-07-25 14:24 ` [PATCH 4/5] git-p4 stream: check apple file type Pete Wyckoff
2009-07-25 14:25 ` [PATCH 5/5] git-p4 stream: use existing p4CmdList with callback Pete Wyckoff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1247301071-9541-1-git-send-email-luke@diamand.org \
--to=luke@diamand.org \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).