All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH/RFC v2] git-p4: stream from perforce to speed up clones
@ 2009-07-11  8:31 Luke Diamand
  2009-07-25 14:23 ` Pete Wyckoff
  0 siblings, 1 reply; 7+ messages in thread
From: Luke Diamand @ 2009-07-11  8:31 UTC (permalink / raw)
  To: git; +Cc: Luke Diamand

Change commit() to stream data from Perforce and into fast-import
rather than reading into memory first, and then writing out. This
hugely reduces the memory requirements when cloning non-incrementally.

Signed-off-by: Luke Diamand <luke@diamand.org>
---
I've modified git-p4 so that it streams/pipes data into fast-import rather
than reading everything into memory first. The old scheme meant that
for a large repository (mine is around 2G) my PC just grinds to a
halt and never actually finishes. With this change it takes around ten
minutes.

This is a resend of a patch I sent earlier, which my MUA helpfully
managed to word-wrap.

 contrib/fast-import/git-p4 |  164 ++++++++++++++++++++++++++++++++++++--------
 1 files changed, 136 insertions(+), 28 deletions(-)

diff --git a/contrib/fast-import/git-p4 b/contrib/fast-import/git-p4
index 342529d..f415ad0 100755
--- a/contrib/fast-import/git-p4
+++ b/contrib/fast-import/git-p4
@@ -1008,6 +1008,141 @@ class P4Sync(Command):
 
         return filesForCommit
 
+    # output one file from the P4 stream
+    # - helper for streamP4Files
+
+    def streamOneP4File(self, file, contents, branchPrefixes):
+        if verbose:
+            sys.stderr.write("%s\n" %  file["depotFile"])
+
+        relPath = self.stripRepoPath(file['depotFile'], branchPrefixes)
+
+        mode = "644"
+        if isP4Exec(file["type"]):
+            mode = "755"
+        elif file["type"] == "symlink":
+            mode = "120000"
+            # p4 print on a symlink contains "target\n", so strip it off
+            last = contents.pop()
+            last = last[:-1]
+            contents.append(last)
+
+        if self.isWindows and file["type"].endswith("text"):
+            mangled = []
+            for data in contents:
+                data = data.replace("\r\n", "\n")
+                mangled.append(data)
+            contents = mangled
+
+        if file['type'] in ('text+ko', 'unicode+ko', 'binary+ko'):
+            contents = map(lambda text: re.sub(r'(?i)\$(Id|Header):[^$]*\$',r'$\1$', text), contents)
+        elif file['type'] in ('text+k', 'ktext', 'kxtext', 'unicode+k', 'binary+k'):
+            contents = map(lambda text: re.sub(r'\$(Id|Header|Author|Date|DateTime|Change|File|Revision):[^$\n]*\$',r'$\1$', text), contents)
+
+        self.gitStream.write("M %s inline %s\n" % (mode, relPath))
+
+        # total length...
+        length = 0
+        for d in contents:
+            length = length + len(d)
+
+        self.gitStream.write("data %d\n" % length)
+        for d in contents:
+            self.gitStream.write(d)
+        self.gitStream.write("\n")
+
+    def streamOneP4Deletion(self, file, branchPrefixes):
+        if verbose:
+            sys.stderr.write("delete %s\n" %  file["path"])
+
+        relPath = self.stripRepoPath(file['path'], branchPrefixes)
+
+        self.gitStream.write("D %s\n" % relPath)
+
+    # Stream directly from "p4 files" into "git fast-import"
+    def streamP4Files(self, files, branchPrefixes):
+        filesForCommit = []
+        filesToRead = []
+        filesToDelete = []
+
+        for f in files:
+            includeFile = True
+            for val in self.clientSpecDirs:
+                if f['path'].startswith(val[0]):
+                    if val[1] <= 0:
+                        includeFile = False
+                    break
+
+            if includeFile:
+                filesForCommit.append(f)
+                if f['action'] not in ('delete', 'purge'):
+                    filesToRead.append(f)
+                else:
+                    filesToDelete.append(f)
+
+        filedata = []
+
+        # deleted files...
+        for f in filesToDelete:
+            self.streamOneP4Deletion(f, branchPrefixes)
+
+        if len(filesToRead) > 0:
+            stdin_file = tempfile.TemporaryFile(prefix='p4-stdin', mode='w+b')
+            stdin_file.write('\n'.join(['%s#%s' % (f['path'], f['rev'])
+                                                  for f in filesToRead]))
+            stdin_file.flush()
+            stdin_file.seek(0)
+            try:
+                p4 = subprocess.Popen('p4 -G -x - print',
+                                        shell=True,
+                                        stdin=stdin_file,
+                                        stdout=subprocess.PIPE);
+            except OSError,e:
+                print >> sys.stderr, "p4 print failed:", e
+
+            file = {}
+            contents = []
+            have_file_info = False
+
+            try:
+                while True:
+                    marshalled = marshal.load(p4.stdout)
+
+                    if marshalled.has_key('depotFile') and have_file_info:
+                        # start of a new file - output the old one first
+
+                        if file["type"] == "apple":
+                            print "\nfile %s is a strange apple file that forks. Ignoring" % file['path']
+                            continue
+
+
+                        self.streamOneP4File(file,contents,branchPrefixes)
+                        file = {}
+                        contents = []
+                        have_file_info = False
+
+                    # pick up the new file information... for the
+                    # 'data' field we need to append to our array
+                    for k in marshalled.keys():
+                        if k == 'data':
+                            contents.append(marshalled['data'])
+                        else:
+                            file[k] = marshalled[k]
+
+                    have_file_info = True
+            except EOFError:
+                pass
+
+            # do the last chunk
+
+            if file.has_key('depotFile'):
+                self.streamOneP4File(file,contents,branchPrefixes)
+
+            exitCode = p4.wait()
+            if exitCode != 0:
+                sys.stderr.write("p4 subshell failed getting file data\n")
+                sys.exit(1)
+
     def commit(self, details, files, branch, branchPrefixes, parent = ""):
         epoch = details["time"]
         author = details["user"]
@@ -1023,7 +1158,6 @@ class P4Sync(Command):
                 new_files.append (f)
             else:
                 sys.stderr.write("Ignoring file outside of prefix: %s\n" % path)
-        files = self.readP4Files(new_files)
 
         self.gitStream.write("commit %s\n" % branch)
 #        gitStream.write("mark :%s\n" % details["change"])
@@ -1051,33 +1185,7 @@ class P4Sync(Command):
                 print "parent %s" % parent
             self.gitStream.write("from %s\n" % parent)
 
-        for file in files:
-            if file["type"] == "apple":
-                print "\nfile %s is a strange apple file that forks. Ignoring!" % file['path']
-                continue
-
-            relPath = self.stripRepoPath(file['path'], branchPrefixes)
-            if file["action"] in ("delete", "purge"):
-                self.gitStream.write("D %s\n" % relPath)
-            else:
-                data = file['data']
-
-                mode = "644"
-                if isP4Exec(file["type"]):
-                    mode = "755"
-                elif file["type"] == "symlink":
-                    mode = "120000"
-                    # p4 print on a symlink contains "target\n", so strip it off
-                    data = data[:-1]
-
-                if self.isWindows and file["type"].endswith("text"):
-                    data = data.replace("\r\n", "\n")
-
-                self.gitStream.write("M %s inline %s\n" % (mode, relPath))
-                self.gitStream.write("data %s\n" % len(data))
-                self.gitStream.write(data)
-                self.gitStream.write("\n")
-
+        self.streamP4Files(new_files,branchPrefixes)
         self.gitStream.write("\n")
 
         change = int(details["change"])
-- 
1.6.3.GIT

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-07-25 14:33 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-07-11  8:31 [PATCH/RFC v2] git-p4: stream from perforce to speed up clones Luke Diamand
2009-07-25 14:23 ` Pete Wyckoff
2009-07-25 14:23   ` [PATCH 1/5] git-p4 stream: remove unused function Pete Wyckoff
2009-07-25 14:24   ` [PATCH 2/5] git-p4 stream: do not pass branchPrefixes so much Pete Wyckoff
2009-07-25 14:24   ` [PATCH 3/5] git-p4 stream: show relative path in debug messages Pete Wyckoff
2009-07-25 14:24   ` [PATCH 4/5] git-p4 stream: check apple file type Pete Wyckoff
2009-07-25 14:25   ` [PATCH 5/5] git-p4 stream: use existing p4CmdList with callback Pete Wyckoff

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.