git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Kache Hit" <kache.hit@gmail.com>
To: <git@vger.kernel.org>
Subject: Filter smudge for secret restoration: no disk access?
Date: Sun, 23 Nov 2025 23:39:22 -0800	[thread overview]
Message-ID: <DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com> (raw)

I was working on a git redaction script that restores working copy
secrets when applied via `.gitattributes` clean/smudge filters, but
encountered `smudge` not having access to the "working file" on disk.

I see it's documented as intended in
https://git-scm.com/docs/gitattributes:

> Note that "%f" is the name of the path that is being worked on.
> Depending on the version that is being filtered, the corresponding
> file on disk may not exist, or may have different contents. So, smudge
> and clean commands should not try to access the file on disk, but only
> act as filters on the content provided to them on standard input.

Any chance there's a way around this or some alternative? Python
implementation below for reference.

And also for my understanding, why _shouldn't_ smudge access disk?

```py
#!/usr/bin/env python3
"""
Git clean/smudge filter for redactions that retains working secrets

If the following is in the repo as `bar/foo_secrets.yml`:
```
    foo_token: ##REDACTED##
    other: "not secret"
```

The local token won't be overwritten on checkout/restore:
```
    foo_token: secret_value
    other: "not secret"
```

Setup & example usage:

Save this file in repo root as `git_redact_filter.py`

`.gitattributes`:
```
bar/foo_secrets.yml filter=foo_token
```

`.gitconfig`:
```
[filter "foo_token"]
  clean = ./git_redact_filter.py --prefix foo_token:
  smudge = ./git_redact_filter.py --prefix foo_token: --smudge %f
```
"""
import inspect
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from typing import TextIO

REDACTED = '##REDACTED##'


def clean(workfile: TextIO, prefixes: list[str], out=None):
    """Write `workfile` to `out` with every secret value replaced by REDACTED.

    A line counts as a secret line when it matches the pattern built by
    `prefix_secret_rgx(prefixes)`. Its `prefix` group is kept and the rest of
    the line is replaced with `REDACTED`. Every other line is copied through
    unchanged.

    Args:
        workfile: Text stream holding the working-copy content.
        prefixes: Literal prefixes that mark a secret line.
        out: Destination text stream; `None` makes `print` use `sys.stdout`.
    """
    pat = prefix_secret_rgx(prefixes)

    # Iterate the stream directly rather than materialising it via readlines().
    for line in workfile:
        if match := pat.match(line):
            # Reuse the line's own terminator so that a final line without a
            # trailing newline does not silently gain one.
            eol = '\n' if line.endswith('\n') else ''
            print(match['prefix'] + REDACTED, end=eol, file=out)
        else:
            print(line, end='', file=out)


def smudge(repofile: TextIO, prefixes: list[str], path: Path, out=None):
    """Write `repofile` to `out`, restoring secrets found in the file at `path`.

    Lines of `repofile` whose secret part equals `REDACTED` are rewritten with
    the value that the working file at `path` holds for the same prefix. Lines
    with no counterpart in the working file, and all non-secret lines, are
    copied through unchanged.

    Args:
        repofile: Text stream holding the redacted repository content.
        prefixes: Literal prefixes that mark a secret line.
        path: Location of the working file to take secret values from.
        out: Destination text stream; `None` makes `print` use `sys.stdout`.
    """
    pat = prefix_secret_rgx(prefixes)

    # The working file may legitimately be absent when the filter runs (the
    # gitattributes documentation warns about this). In that case there is
    # nothing to restore, so the repository content passes through as-is
    # instead of the filter dying with FileNotFoundError.
    try:
        workfile = path.open()
    except FileNotFoundError:
        secrets = {}
    else:
        with workfile:
            # Later lines win when the same prefix appears more than once.
            secrets = {
                match['prefix']: match['secret']
                for match in map(pat.match, workfile)
                if match
            }

    for line in repofile:
        match = pat.match(line)
        restorable = (
            match
            and match['secret'] == REDACTED
            and match['prefix'] in secrets
        )

        if restorable:
            # Reuse the line's own terminator so that a final line without a
            # trailing newline does not silently gain one.
            eol = '\n' if line.endswith('\n') else ''
            print(match['prefix'] + secrets[match['prefix']], end=eol, file=out)
        else:
            print(line, end='', file=out)


def prefix_secret_rgx(prefixes_unsafe: list[str]):
    """Compile the pattern that splits a line into `prefix` and `secret`.

    The `prefix` group covers optional leading whitespace, one of the given
    prefixes (matched literally) and any horizontal whitespace after it. The
    `secret` group is the rest of the line up to, but not including, the
    newline.

    Args:
        prefixes_unsafe: Literal prefixes; they are escaped before being put
            into the pattern, so regex metacharacters have no special meaning.

    Returns:
        A compiled pattern with named groups `prefix` and `secret`. When no
        usable prefix is given the pattern never matches.
    """
    keys = '|'.join(map(re.escape, prefixes_unsafe))
    if not keys:
        # `$^` is not a never-matching pattern: it matches "" and "\n", which
        # then fails on the missing named groups. An empty negative lookahead
        # can never succeed.
        return re.compile(r'(?!)')

    # The whitespace after the key must not include the line terminator;
    # otherwise an empty secret puts the newline inside `prefix` and the
    # redaction marker ends up on a line of its own.
    return re.compile(rf"(?P<prefix>\s*({keys})[^\S\r\n]*)(?P<secret>.*)")


def heredoc(s: str):
    """Return `s` cleaned up by `inspect.cleandoc`, with a newline appended."""
    body = inspect.cleandoc(s)
    return f"{body}\n"


def main():
    """Parse the command line and run the clean or smudge filter on stdin.

    `-p/--prefix` may be repeated to collect several prefixes. When `--smudge`
    is given with a path, stdin is passed to `smudge`; otherwise it is passed
    to `clean`. The chosen filter's return value is returned.
    """
    parser = ArgumentParser(
        description="Git clean/smudge filter for redactions",
    )
    parser.add_argument(
        '-p', '--prefix',
        action='append',
        default=[],
        metavar='PREFIX',
    )
    parser.add_argument('--smudge', type=Path, metavar='PATH')
    opts = parser.parse_args()

    if not opts.smudge:
        return clean(sys.stdin, opts.prefix)
    return smudge(sys.stdin, opts.prefix, opts.smudge)


if __name__ == '__main__':
    sys.exit(main())


import io
from unittest.mock import Mock

import pytest
from pytest import CaptureFixture


# Working-copy content: the file as it exists locally, with the real secret.
work_file = io.StringIO(heredoc("""
    foo_token: secret_value
    other: "not secret"
"""))
# Repository content: the same file with the secret value redacted.
clean_file = io.StringIO(heredoc("""
    foo_token: ##REDACTED##
    other: "not secret"
"""))

# Empty stream; not referenced by any test visible in this file.
empty_file = io.StringIO()
# Working copy in which the secret line is absent.
work_file_secret_removed = io.StringIO(heredoc("""
    other: "not secret"
"""))
# Working copy with an extra line ahead of the secret line.
work_file_lines_added = io.StringIO(heredoc("""
    new_other: 123
    foo_token: secret_value
    other: "not secret"
"""))


def test_clean(capsys: CaptureFixture):
    """clean() prints the redacted form of the working file to stdout."""
    clean(work_file, ['foo_token:'])
    stdout = capsys.readouterr().out
    assert stdout == clean_file.getvalue(), "should be redacted"


def test_clean_idempotent():
    """Running clean() on already-cleaned content yields the same content."""
    first_pass = io.StringIO()
    clean(work_file, ['foo_token:'], first_pass)

    second_pass = io.StringIO()
    clean(io.StringIO(first_pass.getvalue()), ['foo_token:'], second_pass)

    assert second_pass.getvalue() == clean_file.getvalue()


@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_goal(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() restores secrets when the working file can be opened.

    The working file is simulated with a mock whose `open()` returns a fresh
    stream over the contents of `workfile`.
    """
    def open_workfile():
        return io.StringIO(workfile.getvalue())

    path = Mock()
    path.open.side_effect = open_workfile

    smudge(clean_file, ['foo_token:'], path)

    stdout = capsys.readouterr().out
    assert stdout == expected.getvalue(), msg


def test_smudge_idempotent():
    """Running smudge() on already-smudged content yields the same content."""
    def open_workfile():
        return io.StringIO(work_file.getvalue())

    path = Mock()
    path.open.side_effect = open_workfile

    first_pass = io.StringIO()
    smudge(clean_file, ['foo_token:'], path, first_pass)

    # Rewind so that the second run reads the first run's output from the top.
    first_pass.seek(0)
    second_pass = io.StringIO()
    smudge(first_pass, ['foo_token:'], path, second_pass)

    assert first_pass.getvalue() == second_pass.getvalue()


git_doc_url = "https://git-scm.com/docs/gitattributes"


@pytest.mark.xfail(reason=f"should access file on disk: {git_doc_url}")
@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_actual(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() when opening the working file raises FileNotFoundError.

    Marked xfail: the cases mirror `test_smudge_goal`, but the mocked path
    refuses to open, so secrets cannot be read back. The `workfile` parameter
    is not used here.
    """
    # Keep the errno text in its own name so that the parametrized `msg`
    # still labels the assertion below.
    errno_msg = "[Errno 2] No such file or directory: 'bar/foo_secrets.yml'"
    mock_workfile_path = Mock()
    mock_workfile_path.open.side_effect = FileNotFoundError(errno_msg)

    smudge(clean_file, ['foo_token:'], mock_workfile_path)
    captured = capsys.readouterr()
    assert captured.out == expected.getvalue(), msg


@pytest.fixture(autouse=True)
def reset_files():
    """Rewind the shared `work_file` and `clean_file` streams for each test."""
    work_file.seek(0)
    clean_file.seek(0)
```

Thanks,

Kache

             reply	other threads:[~2025-11-24  7:39 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-24  7:39 Kache Hit [this message]
2025-11-24  9:01 ` Filter smudge for secret restoration: no disk access? Johannes Sixt
2025-11-24  9:49   ` Chris Torek
2025-11-24 18:40     ` Kache Hit
2025-11-24 19:35       ` Junio C Hamano
2025-11-25  7:28         ` Kache Hit
2025-11-25  8:55       ` Chris Torek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com \
    --to=kache.hit@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).