From: "Kache Hit" <kache.hit@gmail.com>
To: <git@vger.kernel.org>
Subject: Filter smudge for secret restoration: no disk access?
Date: Sun, 23 Nov 2025 23:39:22 -0800 [thread overview]
Message-ID: <DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com> (raw)
I was working on a git redaction script that restores working copy
secrets when applied via `.gitattributes` clean/smudge filters, but
encountered `smudge` not having access to the "working file" on disk.
I see it's documented as intended in
https://git-scm.com/docs/gitattributes:
> Note that "%f" is the name of the path that is being worked on.
> Depending on the version that is being filtered, the corresponding
> file on disk may not exist, or may have different contents. So, smudge
> and clean commands should not try to access the file on disk, but only
> act as filters on the content provided to them on standard input.
Any chance there's a way around this or some alternative? Python
implementation below for reference.
And also for my understanding, why _shouldn't_ smudge access disk?
```py
#!/usr/bin/env python3
"""
Git clean/smudge filter for redactions that retains working secrets
If the following is in the repo as `bar/foo_secrets.yml`:
```
foo_token: ##REDACTED##
other: "not secret"
```
The local token won't be overwritten on checkout/restore:
```
foo_token: secret_value
other: "not secret"
```
Setup & example usage:
Save this file in repo root as `git_redact_filter.py`
`.gitattributes`:
```
bar/foo_secrets.yml filter=foo_token
```
`.gitconfig`:
```
[filter "foo_token"]
clean = ./git_redact_filter.py --prefix foo_token:
smudge = ./git_redact_filter.py --prefix foo_token: --smudge %f
```
"""
import inspect
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from typing import TextIO
# Placeholder that clean() writes in place of each secret value; smudge() only
# restores a secret on lines whose value is exactly this placeholder.
REDACTED = '##REDACTED##'
def clean(workfile: TextIO, prefixes: list[str], out=None):
    """Write ``workfile`` to ``out`` with every secret value redacted.

    A line that starts with one of ``prefixes`` (optionally surrounded by
    whitespace) is emitted as its matched prefix followed by ``REDACTED``;
    every other line is copied through unchanged.

    Args:
        workfile: Text stream holding the working-copy content.
        prefixes: Literal key prefixes that mark a secret line.
        out: Destination passed to ``print(file=...)``; ``None`` means
            standard output.
    """
    find_secret = prefix_secret_rgx(prefixes).match
    for raw_line in workfile.readlines():
        hit = find_secret(raw_line)
        if hit is None:
            # Not a secret line: the line keeps its own line ending.
            print(raw_line, end='', file=out)
            continue
        print(f"{hit['prefix']}{REDACTED}", file=out)
def smudge(repofile: TextIO, prefixes: list[str], path: Path, out=None):
    """Write ``repofile`` to ``out``, restoring secrets found in ``path``.

    Secrets are harvested from the working file at ``path``: each line that
    matches one of ``prefixes`` contributes its value, keyed by the matched
    prefix text (including surrounding whitespace). A repo line is rewritten
    only when it matches a prefix, its value is exactly ``REDACTED``, and a
    secret with the same prefix text was harvested; all other lines are
    copied through unchanged.

    Git documents that the file named by ``%f`` may not exist (or may have
    different contents) while a filter runs, e.g. on a fresh checkout. A
    missing working file is therefore treated as "no local secrets" and the
    repo content passes through still redacted, instead of the filter
    crashing with ``FileNotFoundError``.

    Args:
        repofile: Text stream holding the redacted content from the repo.
        prefixes: Literal key prefixes that mark a secret line.
        path: Location of the working file to harvest secrets from.
        out: Destination passed to ``print(file=...)``; ``None`` means
            standard output.
    """
    pat = prefix_secret_rgx(prefixes)
    try:
        # Called without arguments on purpose: callers may pass a stand-in
        # object whose ``open`` takes no parameters.
        workfile = path.open()
    except FileNotFoundError:
        secrets = {}
    else:
        with workfile:
            secrets = {
                str(found['prefix']): str(found['secret'])
                for found in map(pat.match, workfile.readlines())
                if found
            }
    for line in repofile.readlines():
        match = pat.match(line)
        restorable = (
            match is not None
            and match['secret'] == REDACTED
            and match['prefix'] in secrets
        )
        if restorable:
            print(match['prefix'] + secrets[match['prefix']], file=out)
        else:
            print(line, end='', file=out)
def prefix_secret_rgx(prefixes_unsafe: list[str]):
    """Compile the pattern that splits a secret line into prefix and value.

    The returned pattern is meant to be applied with ``.match`` to a single
    line. It exposes two named groups:

    * ``prefix``: optional leading whitespace, one of the given prefixes, and
      any whitespace that follows it on the same line.
    * ``secret``: the rest of the line, without its trailing newline.

    Args:
        prefixes_unsafe: Literal prefixes; each is escaped with ``re.escape``
            so regex metacharacters in them are matched literally.

    Returns:
        A compiled pattern. When no usable prefix is given, the pattern never
        matches anything.
    """
    keys = '|'.join(map(re.escape, prefixes_unsafe))
    if not keys:
        # An empty negative lookahead can never succeed. The previous
        # fallback, ``$^``, matched '' and '\n' (``$`` matches before a
        # trailing newline), so blank lines produced a match without the
        # named groups and callers raised IndexError.
        return re.compile(r'(?!)')
    # ``[^\S\n]*`` is "whitespace except newline": a plain ``\s*`` here would
    # swallow the line ending of a secret line with an empty value and pull
    # it into the prefix group.
    return re.compile(rf"(?P<prefix>\s*({keys})[^\S\n]*)(?P<secret>.*)")
def heredoc(s: str) -> str:
    """Return ``s`` cleaned by ``inspect.cleandoc`` plus one trailing newline."""
    body = inspect.cleandoc(s)
    return f"{body}\n"
def main():
    """Parse the command line and run the filter over standard input.

    ``-p/--prefix`` may be repeated to collect several prefixes. When
    ``--smudge PATH`` is given, ``smudge`` runs with that path; otherwise
    ``clean`` runs.

    Returns:
        Whatever the selected filter function returns.
    """
    parser = ArgumentParser(
        description="Git clean/smudge filter for redactions",
    )
    parser.add_argument(
        '-p', '--prefix', action='append', default=[], metavar='PREFIX',
    )
    parser.add_argument('--smudge', metavar='PATH', type=Path)
    opts = parser.parse_args()
    if not opts.smudge:
        return clean(sys.stdin, opts.prefix)
    return smudge(sys.stdin, opts.prefix, opts.smudge)
# Script entry point. When run as a script, sys.exit() ends the process here,
# so the test-only imports and definitions below are never executed.
if __name__ == '__main__':
    sys.exit(main())
import io
from unittest.mock import Mock
import pytest
from pytest import CaptureFixture
# Shared in-memory stand-ins for files, built once at import time.

# Working copy that still holds the real secret.
work_file = io.StringIO(heredoc("""
foo_token: secret_value
other: "not secret"
"""))
# The same content as stored in the repo: the secret is redacted.
clean_file = io.StringIO(heredoc("""
foo_token: ##REDACTED##
other: "not secret"
"""))
# Empty stream.
empty_file = io.StringIO()
# Working copy in which the secret line is missing.
work_file_secret_removed = io.StringIO(heredoc("""
other: "not secret"
"""))
# Working copy with an extra line that the repo version does not have.
work_file_lines_added = io.StringIO(heredoc("""
new_other: 123
foo_token: secret_value
other: "not secret"
"""))
def test_clean(capsys: CaptureFixture):
    """clean() on the working copy prints the redacted repo content."""
    clean(work_file, ['foo_token:'])
    printed = capsys.readouterr().out
    assert printed == clean_file.getvalue(), "should be redacted"
def test_clean_idempotent():
    """Running clean() on already-cleaned content yields the same content."""
    first_pass = io.StringIO()
    clean(work_file, ['foo_token:'], first_pass)
    second_pass = io.StringIO()
    clean(io.StringIO(first_pass.getvalue()), ['foo_token:'], second_pass)
    assert second_pass.getvalue() == clean_file.getvalue()
@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file, work_file, "secrets should be kept"),
    (work_file_lines_added, work_file, "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_goal(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() output when the working file can be opened and read."""

    def open_workfile():
        # Hand out a fresh stream per call so each read starts at the top.
        return io.StringIO(workfile.getvalue())

    fake_path = Mock()
    fake_path.open.side_effect = open_workfile
    smudge(clean_file, ['foo_token:'], fake_path)
    printed = capsys.readouterr().out
    assert printed == expected.getvalue(), msg
def test_smudge_idempotent():
    """Feeding smudge() its own output back in leaves the content unchanged."""

    def open_workfile():
        return io.StringIO(work_file.getvalue())

    fake_path = Mock()
    fake_path.open.side_effect = open_workfile
    first_pass = io.StringIO()
    smudge(clean_file, ['foo_token:'], fake_path, first_pass)
    # Rewind so the second run reads the first run's output from the start.
    first_pass.seek(0)
    second_pass = io.StringIO()
    smudge(first_pass, ['foo_token:'], fake_path, second_pass)
    assert first_pass.getvalue() == second_pass.getvalue()
git_doc_url = "https://git-scm.com/docs/gitattributes"
@pytest.mark.xfail(reason=f"should access file on disk: {git_doc_url}")
@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file, work_file, "secrets should be kept"),
    (work_file_lines_added, work_file, "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_actual(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() when opening the working file raises FileNotFoundError.

    Marked xfail. The parametrized cases mirror ``test_smudge_goal``, but
    here the path's ``open()`` raises instead of returning content.
    """
    # Kept under its own name: rebinding ``msg`` here would discard the
    # per-case assertion message supplied by the parametrization.
    errno_text = "[Errno 2] No such file or directory: 'bar/foo_secrets.yml'"
    err = FileNotFoundError(errno_text)
    mock_workfile_path = Mock()
    mock_workfile_path.open.side_effect = err
    smudge(clean_file, ['foo_token:'], mock_workfile_path)
    captured = capsys.readouterr()
    assert captured.out == expected.getvalue(), msg
@pytest.fixture(autouse=True)
def reset_files():
    """Rewind ``work_file`` and ``clean_file`` to the start before each test."""
    work_file.seek(0)
    clean_file.seek(0)
```
Thanks,
Kache
next reply other threads:[~2025-11-24 7:39 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-24 7:39 Kache Hit [this message]
2025-11-24 9:01 ` Filter smudge for secret restoration: no disk access? Johannes Sixt
2025-11-24 9:49 ` Chris Torek
2025-11-24 18:40 ` Kache Hit
2025-11-24 19:35 ` Junio C Hamano
2025-11-25 7:28 ` Kache Hit
2025-11-25 8:55 ` Chris Torek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com \
--to=kache.hit@gmail.com \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).