* Python-based fetch optimizer script for "blame" in Partial Clones (was: Re: Sensible way to see what objects are being fetched just-in-time in a partial clone?)
2024-08-26 19:37 ` Tao Klerks
@ 2024-08-26 20:37 ` Tao Klerks
0 siblings, 0 replies; 4+ messages in thread
From: Tao Klerks @ 2024-08-26 20:37 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
[-- Attachment #1: Type: text/plain, Size: 1016 bytes --]
On Mon, Aug 26, 2024 at 9:37 PM Tao Klerks <tao@klerks.biz> wrote:
>
> On Mon, Aug 26, 2024 at 7:28 PM Junio C Hamano <gitster@pobox.com> wrote:
> >
> >
> > Unlike the diff machinery, blame does not have a prefetch machinery.
> > I am glad that somebody is looking at it.
>
> I will try to "productize" it sufficiently to send here in case it's
> useful to someone, but all I can really offer the community-at-large
> is confirmation that in principle, the approach works as you would
> expect: With some small number of jit-fetches for rename-detection
> during the revision walk(s), and with one blob-prefetch call
> afterwards, "git blame" can be made to run cleanly/quickly in a
> "filter:none" clone even on a file like "git.c", with hundreds of
> revisions.
FWIW, here is the script I ended up with, which seems to work reliably
for me (through renames etc). Obviously I'd love to see this built
into "git blame" itself, but this wrapper might help someone out there
in the meantime.
[-- Attachment #2: git-pblame.py --]
[-- Type: application/octet-stream, Size: 7592 bytes --]
import subprocess
import sys
# Status messages go to stderr so they never pollute the blame output on stdout.
# Dim them with ANSI escapes when stderr is a terminal; as of 2024 even the
# Windows console understands ANSI escape sequences.
terminal_found_ish = sys.stderr.isatty()


def print_info(info):
    """Write one status line to stderr, dimmed (dark gray) when stderr is a tty."""
    colorize = terminal_found_ish
    if colorize:
        sys.stderr.write("\033[1;30m")
    sys.stderr.write(info)
    if colorize:
        sys.stderr.write("\033[0m")
    sys.stderr.write('\n')
# Simple command-running wrapper
def run_for_stdout_string(args, allowed_errorcodes=None, hide_stderr=False):
    """Run *args* and return its stdout decoded as UTF-8 (undecodable bytes replaced).

    Exit code 0 and any code listed in *allowed_errorcodes* are tolerated;
    any other nonzero code raises subprocess.CalledProcessError.
    When *hide_stderr* is true the child's stderr is captured (and discarded)
    instead of passing through to our own stderr.
    """
    tolerated = allowed_errorcodes or []
    completed = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE if hide_stderr else None,
    )
    if completed.returncode not in tolerated:
        # check_returncode() is a no-op on exit code 0, raises otherwise
        completed.check_returncode()
    return completed.stdout.decode(encoding='utf8', errors="replace")
# Check for local presence of an object in a partial repo (without immediately trying to fetch it!)
def object_exists_locally(oid):
    """Return True when object *oid* is present in the local object store.

    Uses 'git rev-list --missing=print' which, in a partial clone, reports
    missing objects instead of triggering a just-in-time fetch for them.
    """
    # strategy discussed/discovered in https://lore.kernel.org/git/CAPMMpoha6rBA-T-7cn3DQT_nbNfknigLTky55x0TEmt4Ay2GRA@mail.gmail.com/
    # interestingly, even though we're talking to rev-list about an object that's not a rev at all (it's a blob), it's happy anyway.
    try:
        run_for_stdout_string(['git', 'rev-list', '--missing=print', '-1', oid], hide_stderr=True)
        return True
    except subprocess.CalledProcessError:
        # BUGFIX: was a bare 'except:', which also swallowed KeyboardInterrupt /
        # SystemExit. Only a nonzero git exit (CalledProcessError raised by
        # run_for_stdout_string) means "object unknown locally".
        return False
def chunk_list(list_to_chunk, max_size):
    """Split *list_to_chunk* into consecutive sublists of at most *max_size* items."""
    chunks = []
    for start in range(0, len(list_to_chunk), max_size):
        chunks.append(list_to_chunk[start:start + max_size])
    return chunks
# Main logic of script
def prefetch_partialclone_blobs_for_path(target_remote, requested_filename, requested_revision):
    """Bulk-prefetch every blob that 'git blame' of the given file would need.

    Walks the history of requested_filename (following renames manually),
    collects the blob oid of the file at each revision touching it, checks
    which of those blobs are missing locally, and fetches the missing ones
    from target_remote in batches. Returns immediately (no-op) when the repo
    is not a partial clone.
    """
    # 'git config' exits 1 when the key is unset; that exit code is tolerated via [1]
    if run_for_stdout_string(['git', 'config', '--bool', f'remote.{target_remote}.promisor'], [1]).rstrip() != 'true':
        # If not partial clone, nothing to do, just let blame run normally.
        return
    # internal note: as documented in https://git-scm.com/docs/git-log, "--follow" doesn't work ("on non-linear history").
    # Similarly, "--find-renames" doesn't work on a filename-filtered log either (the previous/other name is excluded, so it's just an add)
    # Need to iterate manually over name-status results instead, detecting renames explicitly whenever there's an "Add".
    files_to_check = { requested_filename }
    files_checked = set()
    blobs_to_check_for = set()
    # keep passing over the set until no new (pre-rename) paths were discovered
    while len(files_to_check) > len(files_checked):
        # iterate over a snapshot, since files_to_check may grow during the loop
        for file_to_check in set(files_to_check):
            if file_to_check in files_checked:
                continue
            print_info(f"...checking 'git log' for history of file '{file_to_check}' in {requested_revision}...")
            log_output = run_for_stdout_string(['git', 'log', '--name-status', '--pretty=%h', requested_revision, '--', file_to_check])
            # only raise for the originally-requested path (first pass, nothing checked yet)
            if not (log_output or files_checked):
                # direct reference to requested_filename for clarity, even though we looped on file_to_check which has the same value here
                raise Exception(f"The provided filename/path ({requested_filename}) did not match any git history (of the current commit).")
            current_revision = None
            for log_line in log_output.split('\n'):
                if not log_line:
                    # ignore empty lines in output - they feature between the hash and the "name-status" bit
                    continue
                if '\t' not in log_line:
                    # if something and no tabs, then must be a commit hash
                    current_revision = log_line
                    # record the file's blob oid at this revision;
                    # ls-tree line format: "<mode> <type> <oid>\t<path>"
                    ls_tree_result = run_for_stdout_string(['git', 'ls-tree', current_revision, '-r', '--', file_to_check])
                    if ls_tree_result:
                        blobs_to_check_for.add(ls_tree_result.split('\t')[0].split(' ')[2])
                    continue
                # if tab, this should be a "name-status" line
                # PRODUCTIZATION: note - there might be a bug here with filenames that contain linebreaks!! or more generally with double-quoted qualification
                file_change_type = log_line[0]
                if file_change_type not in ['A', 'R', 'D', 'M', 'T']:
                    raise Exception(f"Unexpected line found in log output: ({log_line})")
                if log_line[0] != 'A':
                    # we're only interested in 'name-status' output if it's telling us about an addition (which might actually have been a rename)
                    continue
                # re-diff the commit against its parent WITHOUT the path filter, so a
                # rename shows up as an 'R' line instead of a bare addition
                diff_namestat_result = run_for_stdout_string(['git', 'diff', '--name-status', f'{current_revision}~..{current_revision}'])
                for changed_file in diff_namestat_result.split('\n'):
                    if file_to_check in changed_file and changed_file.startswith('R'):
                        # take the rename-source path (first tab-separated field after the status)
                        renamed_files = changed_file.split('\t')[1]
                        potentially_new_filepath = renamed_files.split(' ')[0]
                        # PRODUCTIZATION: note - there is probably a bug here with spaces in filenames
                        files_to_check.add(potentially_new_filepath)
                        print_info(f"...added {potentially_new_filepath} to filepaths we look for as per rename in revision {current_revision}...")
            files_checked.add(file_to_check)
            print_info(f"...done checking history of file '{file_to_check}'...")
    print_info(f"...done checking history...")
    print_info(f"...checking for presence of {len(blobs_to_check_for)} blobs in local repo...")
    # in windows running a process per blob will take a while, but still much less than a whole lot of fetch calls of course
    blobs_to_fetch = { blob for blob in blobs_to_check_for if not object_exists_locally(blob) }
    print_info(f"...done checking blob presence...")
    if blobs_to_fetch:
        # actually fetch blobs; don't bother capturing output; all the funky args imitate git's own jit-blob-fetches
        # batch in sets of 300 blobs for windows happiness (commandline length limits)
        print_info(f"...fetching {len(blobs_to_fetch)} blobs in bulk from remote {target_remote}...")
        for blobs_chunk in chunk_list(list(blobs_to_fetch), 300):
            run_for_stdout_string(['git', '-c', 'fetch.negotiationAlgorithm=noop', 'fetch', target_remote, '--no-tags', '--no-write-fetch-head', '--recurse-submodules=no', '--filter=blob:none', *blobs_chunk])
        print_info(f"...done fetching...")
# Main script
def run_script():
    """Entry point: parse argv, bulk-prefetch blobs, then run 'git blame'.

    Argv convention: the LAST argument is the filepath; with 2+ args the FIRST
    is the revision; anything in between is passed through to 'git blame'.
    Exits with blame's own exit code.
    """
    if len(sys.argv) < 2:
        raise Exception(f"This script ({__file__}) expects at least one argument, the filepath to blame.\n"
                        "The *last* argument is expected to be the filepath.\n"
                        "If more than 1 arg is provided, then the first arg is expected to be a revision.\n"
                        "If more than 2 args are provided, the additional args (in the middle) are passed directly to 'git blame'.\n"
                        )
    requested_filename = sys.argv[-1]
    # with exactly one user arg (just the filepath), blame against HEAD
    requested_revision = sys.argv[1] if len(sys.argv) > 2 else 'HEAD'
    # PRODUCTIZATION: There should be a way to specify the remote to this script even as everything else goes to blame
    target_remote = 'origin'
    prefetch_partialclone_blobs_for_path(target_remote, requested_filename, requested_revision)
    # actually run blame, forwarding all original arguments unchanged
    blame_args = ['git', 'blame', *sys.argv[1:]]
    print_info(f"...running blame ({' '.join(blame_args)})...")
    process_result = subprocess.run(blame_args)
    # BUGFIX: use sys.exit() rather than the site-module exit() helper, which is
    # not guaranteed to exist (e.g. under 'python -S' or in frozen interpreters)
    sys.exit(process_result.returncode)
# Run only when executed as a script (not when imported as a module)
if __name__ == '__main__':
    run_script()
^ permalink raw reply [flat|nested] 4+ messages in thread