All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Kache Hit" <kache.hit@gmail.com>
To: <git@vger.kernel.org>
Subject: Filter smudge for secret restoration: no disk access?
Date: Sun, 23 Nov 2025 23:39:22 -0800	[thread overview]
Message-ID: <DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com> (raw)

I was working on a git redaction script that restores working-copy
secrets when applied via `.gitattributes` clean/smudge filters, but
ran into `smudge` not being able to read the "working file" on disk.

I see it's documented as intended in
https://git-scm.com/docs/gitattributes:

> Note that "%f" is the name of the path that is being worked on.
> Depending on the version that is being filtered, the corresponding
> file on disk may not exist, or may have different contents. So, smudge
> and clean commands should not try to access the file on disk, but only
> act as filters on the content provided to them on standard input.

Any chance there's a way around this or some alternative? Python
implementation below for reference.

And also for my understanding, why _shouldn't_ smudge access disk?

```py
#!/usr/bin/env python3
"""
Git clean/smudge filter for redactions that retains working secrets

If the following is in the repo as `bar/foo_secrets.yml`:
```
    foo_token: ##REDACTED##
    other: "not secret"
```

The local token won't be overwritten on checkout/restore:
```
    foo_token: secret_value
    other: "not secret"
```

Setup & example usage:

Save this file in repo root as `git_redact_filter.py`

`.gitattributes`:
```
bar/foo_secrets.yml filter=foo_token
```

`.gitconfig`:
```
[filter "foo_token"]
  clean = ./git_redact_filter.py --prefix foo_token:
  smudge = ./git_redact_filter.py --prefix foo_token: --smudge %f
```
"""
import inspect
import re
import sys
from argparse import ArgumentParser
from pathlib import Path
from typing import TextIO

# Placeholder that clean() writes in place of a secret value and that
# smudge() looks for when deciding whether a value should be restored.
REDACTED = '##REDACTED##'


def clean(workfile: TextIO, prefixes: list[str], out=None):
    """Copy `workfile` to `out`, replacing every secret value with REDACTED.

    A line counts as a secret line when it matches the pattern built by
    `prefix_secret_rgx(prefixes)`. For such a line the matched prefix is
    kept and the rest of the line is replaced by the REDACTED placeholder.
    Every other line is copied through unchanged.

    Args:
        workfile: Text stream with the working-copy content.
        prefixes: Literal line prefixes (e.g. ``'foo_token:'``) that mark a
            secret line.
        out: Text stream to write to. ``None`` means the current
            ``sys.stdout``, as resolved by ``print`` at call time.
    """
    pat = prefix_secret_rgx(prefixes)

    for line in workfile:
        # Match against the line without its terminator so the terminator
        # can never be captured as part of the prefix, then re-emit exactly
        # the terminator the line had (possibly none on the last line).
        content = line.rstrip('\r\n')
        ending = line[len(content):]

        if match := pat.match(content):
            print(match['prefix'] + REDACTED, end=ending, file=out)
        else:
            print(line, end='', file=out)


def smudge(repofile: TextIO, prefixes: list[str], path: Path, out=None):
    """Copy `repofile` to `out`, restoring secrets from the working file.

    Secret lines are recognised with `prefix_secret_rgx(prefixes)`. The file
    at `path` is scanned first to collect the value currently stored for
    each prefix. Then every line of `repofile` whose value is the REDACTED
    placeholder is rewritten with the collected value for the same prefix.
    All other lines are copied through unchanged.

    Git documents that the file named by ``%f`` may not exist (or may have
    different contents) when a smudge filter runs. When it cannot be found
    there is nothing to restore from, so the content passes through as-is
    instead of the filter failing.

    Args:
        repofile: Text stream with the content stored in the repository.
        prefixes: Literal line prefixes (e.g. ``'foo_token:'``) that mark a
            secret line.
        path: Location of the working-copy file to read local secrets from.
        out: Text stream to write to. ``None`` means the current
            ``sys.stdout``, as resolved by ``print`` at call time.
    """
    pat = prefix_secret_rgx(prefixes)

    try:
        with path.open() as workfile:
            work_lines = workfile.readlines()
    except (FileNotFoundError, NotADirectoryError):
        # No working copy to read: fall back to "no local secrets".
        work_lines = []

    # prefix -> value currently in the working file; for a repeated prefix
    # the last occurrence wins.
    local_secrets = {
        match['prefix']: match['secret']
        for match in (pat.match(line.rstrip('\r\n')) for line in work_lines)
        if match
    }

    for line in repofile:
        # Keep the terminator out of the match and re-emit it unchanged so
        # a restored line ends exactly like the line it replaces.
        content = line.rstrip('\r\n')
        ending = line[len(content):]

        match = pat.match(content)
        restored = None
        if match and match['secret'] == REDACTED:
            restored = local_secrets.get(match['prefix'])

        if restored is not None:
            print(match['prefix'] + restored, end=ending, file=out)
        else:
            print(line, end='', file=out)


def prefix_secret_rgx(prefixes_unsafe: list[str]):
    """Build the compiled pattern that recognises secret lines.

    A match exposes two named groups:

    * ``prefix`` - optional leading whitespace, one of the given prefixes,
      and any whitespace that follows it on the same line;
    * ``secret`` - the remainder of the line (never the line terminator).

    Args:
        prefixes_unsafe: Literal prefixes. They are escaped here, so regex
            metacharacters in them have no special meaning. Empty strings
            are ignored, since an empty prefix would match every line.

    Returns:
        A compiled pattern. When no usable prefix is given the pattern
        never matches anything.
    """
    keys = '|'.join(re.escape(prefix) for prefix in prefixes_unsafe if prefix)
    if not keys:
        # An empty negative lookahead fails at every position. The previous
        # '$^' matched '' and '\n', yielding a match without named groups.
        return re.compile(r'(?!)')

    # [^\S\r\n] is "whitespace except CR/LF": unlike \s it cannot swallow
    # the line terminator into the prefix when the secret value is empty.
    pat = rf"(?P<prefix>[^\S\r\n]*({keys})[^\S\r\n]*)(?P<secret>.*)"
    return re.compile(pat)


def heredoc(s: str):
    """Dedent an indented multi-line literal and end it with one newline.

    The text is normalised with ``inspect.cleandoc`` and a single ``'\\n'``
    is appended to the result.
    """
    dedented = inspect.cleandoc(s)
    return f"{dedented}\n"


def main():
    """Parse the command line and run the selected filter on stdin.

    ``-p/--prefix`` may be repeated and collects the secret-line prefixes.
    When ``--smudge PATH`` is given the smudge filter runs with that path;
    otherwise the clean filter runs. Returns whatever the chosen filter
    returns.
    """
    cli = ArgumentParser(description="Git clean/smudge filter for redactions")
    cli.add_argument(
        '-p', '--prefix', action='append', default=[], metavar='PREFIX',
    )
    cli.add_argument('--smudge', type=Path, metavar='PATH')
    opts = cli.parse_args()

    if not opts.smudge:
        return clean(sys.stdin, opts.prefix)
    return smudge(sys.stdin, opts.prefix, opts.smudge)


# Script entry point: run the filter and exit with main()'s result as the
# process status. Nothing after this point in the file runs in script mode.
if __name__ == '__main__':
    raise SystemExit(main())


import io
from unittest.mock import Mock

import pytest
from pytest import CaptureFixture


# In-memory stand-ins for the files the filters read and produce.

# Working-copy content: carries the real secret value.
work_file = io.StringIO(heredoc("""
    foo_token: secret_value
    other: "not secret"
"""))
# Repository content: the same file with the secret replaced by the placeholder.
clean_file = io.StringIO(heredoc("""
    foo_token: ##REDACTED##
    other: "not secret"
"""))

# Empty stream (not referenced by the tests visible in this file).
empty_file = io.StringIO()
# Working copy from which the secret line has been deleted.
work_file_secret_removed = io.StringIO(heredoc("""
    other: "not secret"
"""))
# Working copy with an extra, non-secret line ahead of the secret.
work_file_lines_added = io.StringIO(heredoc("""
    new_other: 123
    foo_token: secret_value
    other: "not secret"
"""))


def test_clean(capsys: CaptureFixture):
    """clean() prints the redacted form of the working file to stdout."""
    clean(work_file, ['foo_token:'])
    stdout = capsys.readouterr().out
    assert stdout == clean_file.getvalue(), "should be redacted"


def test_clean_idempotent():
    """Running clean() on its own output still yields the redacted file."""
    prefixes = ['foo_token:']

    first_pass = io.StringIO()
    clean(work_file, prefixes, first_pass)

    second_pass = io.StringIO()
    clean(io.StringIO(first_pass.getvalue()), prefixes, second_pass)

    assert second_pass.getvalue() == clean_file.getvalue()


@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_goal(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() output when the working file can be opened.

    The path is a Mock whose ``open()`` hands back a fresh copy of the
    parametrized working-file content on every call.
    """
    def open_workfile():
        return io.StringIO(workfile.getvalue())

    path = Mock()
    path.open.side_effect = open_workfile

    smudge(clean_file, ['foo_token:'], path)
    assert capsys.readouterr().out == expected.getvalue(), msg


def test_smudge_idempotent():
    """Smudging already-smudged content produces the same content again."""
    def open_workfile():
        return io.StringIO(work_file.getvalue())

    path = Mock()
    path.open.side_effect = open_workfile
    prefixes = ['foo_token:']
    smudged_once = io.StringIO()
    smudged_twice = io.StringIO()

    smudge(clean_file, prefixes, path, smudged_once)
    # Rewind so the first result can be read back as the next input.
    smudged_once.seek(0)
    smudge(smudged_once, prefixes, path, smudged_twice)

    assert smudged_once.getvalue() == smudged_twice.getvalue()


git_doc_url = "https://git-scm.com/docs/gitattributes"


@pytest.mark.xfail(reason=f"should access file on disk: {git_doc_url}")
@pytest.mark.parametrize(['workfile', 'expected', 'msg'], [
    (work_file,                work_file,  "secrets should be kept"),
    (work_file_lines_added,    work_file,  "should retain secret"),
    (work_file_secret_removed, clean_file, "should restore redacted"),
])
def test_smudge_actual(capsys: CaptureFixture, workfile, expected, msg):
    """smudge() as it runs when the working file is missing on disk.

    The path is a Mock whose ``open()`` raises ``FileNotFoundError``. The
    expectations are the same as in the readable-file scenario, which is
    why the test is marked as an expected failure.
    """
    # Keep the errno text in its own name: assigning it to `msg` would
    # replace the parametrized message used by the assertion below.
    errno_text = "[Errno 2] No such file or directory: 'bar/foo_secrets.yml'"
    mock_workfile_path = Mock()
    mock_workfile_path.open.side_effect = FileNotFoundError(errno_text)

    smudge(clean_file, ['foo_token:'], mock_workfile_path)
    captured = capsys.readouterr()
    assert captured.out == expected.getvalue(), msg


@pytest.fixture(autouse=True)
def reset_files():
    """Rewind the shared streams that tests read, before every test."""
    work_file.seek(0)
    clean_file.seek(0)
```

Thanks,

Kache

             reply	other threads:[~2025-11-24  7:39 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-24  7:39 Kache Hit [this message]
2025-11-24  9:01 ` Filter smudge for secret restoration: no disk access? Johannes Sixt
2025-11-24  9:49   ` Chris Torek
2025-11-24 18:40     ` Kache Hit
2025-11-24 19:35       ` Junio C Hamano
2025-11-25  7:28         ` Kache Hit
2025-11-25  8:55       ` Chris Torek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=DEGR5XSM0EVG.27IMOKOK1O98Y@gmail.com \
    --to=kache.hit@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.