git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Filter smudge for secret restoration: no disk access?
@ 2025-11-24  7:39 Kache Hit
  2025-11-24  9:01 ` Johannes Sixt
  0 siblings, 1 reply; 7+ messages in thread
From: Kache Hit @ 2025-11-24  7:39 UTC (permalink / raw)
  To: git

I am working on a git redaction script that restores working-copy
secrets when applied via `.gitattributes` clean/smudge filters, but
found that `smudge` cannot rely on the "working file" existing on disk.

I see it's documented as intended in
https://git-scm.com/docs/gitattributes:

> Note that "%f" is the name of the path that is being worked on.
> Depending on the version that is being filtered, the corresponding
> file on disk may not exist, or may have different contents. So, smudge
> and clean commands should not try to access the file on disk, but only
> act as filters on the content provided to them on standard input.

Any chance there's a way around this or some alternative? Python
implementation below for reference.

And also for my understanding, why _shouldn't_ smudge access disk?

```py
#!/usr/bin/env python3
"""
Git clean/smudge filter for redactions that retains working secrets

If the following is in the repo as `bar/foo_secrets.yml`:
```
    foo_token: ##REDACTED##
    other: "not secret"
```

The local token won't be overwritten on checkout/restore:
```
    foo_token: secret_value
    other: "not secret"
```

Setup & example usage:

Save this file in repo root as `git_redact_filter.py`

`.gitattributes`:
```
bar/foo_secrets.yml filter=foo_token
```

`.gitconfig`:
```
[filter "foo_token"]
  clean = ./git_redact_filter.py --prefix foo_token:
  smudge = ./git_redact_filter.py --prefix foo_token: --smudge %f
```
"""
import inspect
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from typing import TextIO

REDACTED = '##REDACTED##'


def clean(workfile: TextIO, prefixes: list[str], out=None):
    """Redact secrets from working-tree content (the git ``clean`` direction).

    Every line that starts with one of ``prefixes`` (optionally preceded by
    whitespace) has everything after the prefix replaced by the ``REDACTED``
    placeholder.  All other lines are copied through unchanged.

    Args:
        workfile: Text stream holding the working-tree content.
        prefixes: Literal key prefixes that mark a secret line.
        out: Text stream to write to; ``None`` means ``print``'s default
            (``sys.stdout``).
    """
    pat = prefix_secret_rgx(prefixes)

    for line in workfile:
        # Match against the line without its terminator and write the
        # terminator back exactly as read.  That way a final line lacking a
        # newline does not gain one, and the terminator can never end up
        # inside the captured prefix when the value is empty.
        body = line.rstrip('\r\n')
        if match := pat.match(body):
            redacted = match['prefix'] + REDACTED + line[len(body):]
            print(redacted, end='', file=out)
        else:
            print(line, end='', file=out)


def smudge(repofile: TextIO, prefixes: list[str], path: Path, out=None):
    """Restore local secrets into repository content (git ``smudge``).

    Each line of ``repofile`` whose secret is the ``REDACTED`` placeholder is
    rewritten with the secret found under the same prefix in the working file
    at ``path``.  Lines without a matching local secret, and all non-secret
    lines, are copied through unchanged.

    Args:
        repofile: Text stream holding the content as stored in the repository.
        prefixes: Literal key prefixes that mark a secret line.
        path: Location of the working file to take local secrets from.  It is
            allowed to be missing; the content is then passed through as-is.
        out: Text stream to write to; ``None`` means ``print``'s default
            (``sys.stdout``).
    """
    pat = prefix_secret_rgx(prefixes)
    secrets = _working_secrets(pat, path)

    for line in repofile:
        match = pat.match(line)
        secret = secrets.get(match['prefix']) if match else None

        if secret is not None and match['secret'] == REDACTED:
            # The secret group equals the placeholder, so the match stops
            # right before the line terminator; re-emit that terminator
            # verbatim instead of always forcing a newline.
            restored = match['prefix'] + secret + line[match.end():]
            print(restored, end='', file=out)
        else:
            print(line, end='', file=out)


def _working_secrets(pat, path):
    """Map each secret line's prefix to its secret in the file at ``path``."""
    try:
        workfile = path.open()
    except (FileNotFoundError, NotADirectoryError):
        # gitattributes(5): the file named by %f "may not exist" on disk
        # (e.g. on a fresh checkout), so there is simply nothing to restore.
        return {}

    with workfile:
        # Later lines win when the same prefix occurs more than once.
        return {
            match['prefix']: match['secret']
            for match in map(pat.match, workfile)
            if match
        }


def prefix_secret_rgx(prefixes_unsafe: list[str]):
    """Compile the pattern that splits a line into key prefix and secret.

    The pattern is meant to be applied with ``match`` to one line at a time.
    A successful match exposes two named groups: ``prefix`` (optional leading
    whitespace, one of the given prefixes, optional trailing whitespace) and
    ``secret`` (the rest of the line, excluding the newline).

    Args:
        prefixes_unsafe: Literal key prefixes, e.g. ``['foo_token:']``.  They
            are escaped, so regex metacharacters in them match verbatim.

    Returns:
        A compiled pattern.  Without any usable prefix it never matches.
    """
    keys = '|'.join(map(re.escape, prefixes_unsafe))
    if not keys:
        # `(?!)` can never match.  `$^` is not a safe stand-in: it matches
        # the empty string and a bare newline, i.e. every blank line, and
        # such a match has no `prefix`/`secret` groups to read.
        return re.compile(r'(?!)')

    # Horizontal whitespace only: `\s*` would also swallow the newline that
    # follows an empty value and drag it into the `prefix` group.
    gap = r'[^\S\r\n]*'
    return re.compile(rf"(?P<prefix>{gap}({keys}){gap})(?P<secret>.*)")


def heredoc(s: str):
    """Return ``s`` cleaned by ``inspect.cleandoc`` plus one trailing newline.

    Lets an indented triple-quoted literal stand in for the contents of a
    newline-terminated text file.
    """
    body = inspect.cleandoc(s)
    return f"{body}\n"


def main():
    """Parse the command line and filter stdin in the requested direction.

    With ``--smudge PATH`` stdin is smudged using ``PATH`` as the working
    file; otherwise stdin is cleaned.  ``-p/--prefix`` may be repeated to
    collect several prefixes.
    """
    parser = ArgumentParser(
        description="Git clean/smudge filter for redactions",
    )
    parser.add_argument(
        '-p', '--prefix', action='append', default=[], metavar='PREFIX',
    )
    parser.add_argument('--smudge', type=Path, metavar='PATH')
    opts = parser.parse_args()

    if not opts.smudge:
        return clean(sys.stdin, opts.prefix)
    return smudge(sys.stdin, opts.prefix, opts.smudge)


# Script entry point: run the filter and exit with whatever main() returns.
if __name__ == '__main__':
    raise SystemExit(main())


import io
from unittest.mock import Mock

import pytest
from pytest import CaptureFixture


# Shared in-memory stand-ins for file contents used by the tests below.

# Working-tree content with the real secret in place.
work_file = io.StringIO(heredoc("""
    foo_token: secret_value
    other: "not secret"
"""))
# The same content with the secret replaced by the redaction placeholder.
clean_file = io.StringIO(heredoc("""
    foo_token: ##REDACTED##
    other: "not secret"
"""))

# No content at all.
empty_file = io.StringIO()
# Working-tree content in which the secret line is absent.
work_file_secret_removed = io.StringIO(heredoc("""
    other: "not secret"
"""))
# Working-tree content with an extra line ahead of the secret.
work_file_lines_added = io.StringIO(heredoc("""
    new_other: 123
    foo_token: secret_value
    other: "not secret"
"""))


def test_clean(capsys: CaptureFixture):
    """`clean` prints the redacted form of the working file to stdout."""
    expected = clean_file.getvalue()
    clean(work_file, ['foo_token:'])
    assert capsys.readouterr().out == expected, "should be redacted"


def test_clean_idempotent():
    """Running `clean` on its own output still yields the redacted content."""
    first_pass = io.StringIO()
    clean(work_file, ['foo_token:'], first_pass)

    second_pass = io.StringIO()
    clean(io.StringIO(first_pass.getvalue()), ['foo_token:'], second_pass)

    assert second_pass.getvalue() == clean_file.getvalue()


@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_goal(capsys: CaptureFixture, workfile, expected, msg):
    """Desired `smudge` behaviour when the working file can be opened.

    The working file is simulated by a mock whose ``open()`` hands back a
    fresh stream over ``workfile``'s content.
    """
    disk_content = workfile.getvalue()
    path = Mock()
    path.open.side_effect = lambda: io.StringIO(disk_content)

    smudge(clean_file, ['foo_token:'], path)

    assert capsys.readouterr().out == expected.getvalue(), msg


def test_smudge_idempotent():
    """Running `smudge` on its own output leaves that output unchanged."""
    path = Mock()
    path.open.side_effect = lambda: io.StringIO(work_file.getvalue())

    smudged_once = io.StringIO()
    smudge(clean_file, ['foo_token:'], path, smudged_once)
    # Rewind so the second pass reads what the first pass wrote.
    smudged_once.seek(0)

    smudged_twice = io.StringIO()
    smudge(smudged_once, ['foo_token:'], path, smudged_twice)

    assert smudged_once.getvalue() == smudged_twice.getvalue()


git_doc_url = "https://git-scm.com/docs/gitattributes"
@pytest.mark.xfail(reason=f"should access file on disk: {git_doc_url}")
@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_actual(capsys: CaptureFixture, workfile, expected, msg):
    """`smudge` as observed under git: opening the working file fails.

    ``workfile`` is unused here because the mocked ``open()`` raises before
    any content could be read.
    """
    # Kept separate from the parametrized `msg` so that a failing assertion
    # reports the case description rather than the errno text.
    errno_text = "[Errno 2] No such file or directory: 'bar/foo_secrets.yml'"
    mock_workfile_path = Mock()
    mock_workfile_path.open.side_effect = FileNotFoundError(errno_text)

    smudge(clean_file, ['foo_token:'], mock_workfile_path)
    captured = capsys.readouterr()
    assert captured.out == expected.getvalue(), msg


@pytest.fixture(autouse=True)
def reset_files():
    """Rewind `work_file` and `clean_file` to their start for each test."""
    work_file.seek(0)
    clean_file.seek(0)
```

Thanks,

Kache

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-11-25  8:55 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-24  7:39 Filter smudge for secret restoration: no disk access? Kache Hit
2025-11-24  9:01 ` Johannes Sixt
2025-11-24  9:49   ` Chris Torek
2025-11-24 18:40     ` Kache Hit
2025-11-24 19:35       ` Junio C Hamano
2025-11-25  7:28         ` Kache Hit
2025-11-25  8:55       ` Chris Torek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).