RedBear-OS/local/scripts/audit-patch-idempotency.py

#!/usr/bin/env python3
"""Validate the idempotency of every external patch in local/patches/.

Per AGENTS.md "NO OVERLAY-STYLE PATCHES — AMENDED 2026" Rule 2, big
external projects use the cookbook's `cookbook_apply_patches` helper
which checks `git apply --reverse --check` to skip already-applied
patches. If a patch's reverse check fails (because the upstream
source drifted from the patch's expected state), the helper tries to
apply the patch forward, which fails too because some hunks no
longer apply. The result is a confusing cook failure.

JSON SCHEMA (with --json):
  Top-level:
    patches:        [PatchEntry, ...]   one per patch in local/patches/
    total:          int                  len(patches)
    errors:         int                  total number of error strings across all entries
    skipped:        int                  count of entries skipped because of --no-fetch
  Per-entry:
    component:      str                  e.g. "mesa", "libdrm"
    patch:          str                  filename, e.g. "01-foo.patch"
    status:         "ok" | "fail" | "skipped"
    errors:         [str, ...]           empty unless status == "fail"
  Exit code: 0 if no errors were found (or no patches exist), 1 if any
  check failed, 2 if every patch was skipped. With --no-fetch, all
  entries are "skipped" and nothing is actually audited, so the make
  lint-patches target chain should treat exit code 2 (skipped == total)
  as a soft failure rather than a pass.

This script catches that class of bug at lint time. For every
[0-9]*.patch under local/patches/<component>/, it:

  1. Clones the upstream repo at the pinned rev into a temp dir
  2. Applies the patch
  3. Verifies `git apply --reverse --check` succeeds on the result
     (i.e. the patch is fully reversible — idempotency invariant)
  4. Re-applies the patch on top of the already-patched tree
  5. Verifies that this second apply is rejected (if it succeeds,
     applying the patch twice is not the same as applying it once)
  6. Verifies the result is reproducible: re-clone, re-apply, byte-equal

If any check fails, the script exits non-zero and prints which patches
are non-idempotent. CI or `make lint` should run this on every PR.

Usage:
  ./local/scripts/audit-patch-idempotency.py [--component <name>] [--verbose]
                                              [--no-fetch] [--json]
"""
import argparse
import re
import shutil
import subprocess
import sys
import tempfile
import tomllib
from pathlib import Path

# Repository root: two directory levels above the directory holding this
# script (the usage text places the script at local/scripts/).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# One sub-directory per component, each holding that component's patches.
PATCHES_ROOT = PROJECT_ROOT / "local" / "patches"
# NOTE(review): not referenced by any function visible in this file.
SOURCE_ROOT = PROJECT_ROOT / "local" / "sources"
# Recipe trees searched by resolve_upstream(): the local overlay tree and
# the mainline tree.
RECIPES_ROOT = PROJECT_ROOT / "local" / "recipes"
MAINLINE_RECIPES = PROJECT_ROOT / "recipes"

# A patch file name is "<digits>-<name>.patch", where <name> is limited to
# ASCII letters, digits, "_", "." and "-".
PATCH_NAME_RE = re.compile(r"^\d+-[A-Za-z0-9_.-]+\.patch$")
# Captures the leading numeric prefix of a patch file name.
# NOTE(review): not referenced by any function visible in this file.
NUM_PREFIX_RE = re.compile(r"^(\d+)-")


def run(cmd, **kwargs):
    """Execute *cmd* and hand back ``(returncode, stdout, stderr)``.

    Both output streams are captured and decoded as text. A non-zero exit
    status is reported through the return code instead of an exception.
    Any extra keyword arguments are forwarded to ``subprocess.run``.
    """
    fixed_options = {"capture_output": True, "text": True, "check": False}
    completed = subprocess.run(cmd, **fixed_options, **kwargs)
    return (completed.returncode, completed.stdout, completed.stderr)


def collect_patches(component_filter=None):
    """Generate ``(component, patch_path)`` pairs for the external patches.

    Component directories under PATCHES_ROOT are visited in sorted order,
    and so are the files inside each one. Only regular files whose name
    matches PATCH_NAME_RE are produced. When *component_filter* is given,
    every other component directory is ignored. Nothing is produced when
    PATCHES_ROOT is not a directory.
    """
    if not PATCHES_ROOT.is_dir():
        return
    for directory in sorted(PATCHES_ROOT.iterdir()):
        name = directory.name
        selected = not component_filter or name == component_filter
        if not (directory.is_dir() and selected):
            continue
        yield from (
            (name, entry)
            for entry in sorted(directory.iterdir())
            if entry.is_file() and PATCH_NAME_RE.match(entry.name)
        )


def resolve_upstream(component) -> "tuple[str | None, str | None] | tuple[str, str | None, Path]":
    """Find the upstream source reference for *component* from its recipe.

    Every ``recipe.toml`` below RECIPES_ROOT and MAINLINE_RECIPES whose
    parent directory is named exactly *component* is considered (e.g.
    recipes/libs/mesa/recipe.toml matches component="mesa"). Matching is
    on the package directory, not the category, so a differently named
    package such as recipes/wip/demos/mesa-demos never matches "mesa".

    Returns one of:
      * ``(url, rev)`` for a git source. ``rev`` is the pinned "rev", or
        ``refs/heads/<branch>`` when only "branch" is given. When both
        trees provide a git recipe, the mainline one wins.
      * ``("tar", tar_url, recipe_path)`` as soon as a matching recipe
        with a tar source is encountered.
      * ``(None, None)`` when no usable recipe exists.
    """
    # Each candidate is (came_from_local_tree, git_url, rev).
    candidates: list[tuple[bool, str, str]] = []
    for recipes_root in (RECIPES_ROOT, MAINLINE_RECIPES):
        if not recipes_root.is_dir():
            continue
        is_local = recipes_root is RECIPES_ROOT
        # Sorted so the outcome does not depend on filesystem order.
        for recipe_toml in sorted(recipes_root.rglob("recipe.toml")):
            if recipe_toml.parent.name != component:
                continue
            # Ignore recipe.toml files below a source/ or target/ directory.
            # Only the path *inside* the recipe tree is inspected, so that a
            # checkout living under e.g. ~/source/ does not hide every recipe.
            inner_parts = recipe_toml.relative_to(recipes_root).parts
            if "source" in inner_parts or "target" in inner_parts:
                continue
            try:
                with open(recipe_toml, "rb") as f:
                    data = tomllib.load(f)
            except (OSError, tomllib.TOMLDecodeError):
                # Unreadable or malformed recipe: treat it as absent.
                continue
            source = data.get("source")
            if not isinstance(source, dict):
                # Missing or non-table [source]: nothing to resolve.
                continue
            if "git" in source:
                # Either a pinned rev or a branch tip is accepted as the
                # patch's "from" state. A pinned rev is the reproducible
                # choice; a branch tip moves over time.
                if "rev" in source:
                    rev = str(source["rev"])
                elif "branch" in source:
                    rev = f"refs/heads/{source['branch']}"
                else:
                    continue
                candidates.append((is_local, source["git"], rev))
            elif "tar" in source:
                return ("tar", source.get("tar"), recipe_toml)
    if not candidates:
        return None, None
    # Stable sort: mainline (False) before local (True), keeping discovery
    # order within each tree. The origin tree is tracked explicitly instead
    # of searching the path text for "local", which also matched unrelated
    # path components such as /usr/local/ or a component named "localed".
    candidates.sort(key=lambda c: c[0])
    _, url, rev = candidates[0]
    return url, rev


def clone_source(url, rev, target):
    """Clone *url* into *target* and check out *rev*.

    Any existing *target* is removed first. *rev* may be a commit id, a
    tag, or a branch given as ``refs/heads/<branch>``.

    Returns ``(True, None)`` on success, or ``(False, message)`` where
    *message* describes the failing git step.
    """
    if target.exists():
        shutil.rmtree(target)
    target.mkdir(parents=True)
    rc, _, err = run(
        ["git", "clone", "--quiet", "--no-checkout", url, str(target)],
    )
    if rc != 0:
        return False, f"clone failed: {err.strip()}"
    rc, _, err = run(
        ["git", "-C", str(target), "checkout", "--quiet", rev],
    )
    heads_prefix = "refs/heads/"
    if rc != 0 and rev.startswith(heads_prefix):
        # A fresh clone only creates a local branch for the remote's default
        # branch; every other branch exists solely as a remote-tracking ref.
        # Retry against that ref so non-default branches can be audited too.
        remote_ref = "refs/remotes/origin/" + rev.removeprefix(heads_prefix)
        retry_rc, _, _ = run(
            ["git", "-C", str(target), "checkout", "--quiet", "--detach", remote_ref],
        )
        if retry_rc == 0:
            return True, None
    if rc != 0:
        # Report the error from the original checkout attempt.
        return False, f"checkout {rev} failed: {err.strip()}"
    return True, None


def apply_patch(source_dir, patch_path):
    """Run ``git apply`` for *patch_path* inside *source_dir*.

    Whitespace warnings are suppressed. Returns ``(True, None)`` when git
    exits with status 0; otherwise ``(False, message)`` where *message* is
    git's stderr, or its stdout when stderr is empty, stripped.
    """
    argv = [
        "git", "-C", str(source_dir),
        "apply", "--whitespace=nowarn",
        str(patch_path),
    ]
    status, stdout, stderr = run(argv)
    if status == 0:
        return True, None
    return False, (stderr or stdout).strip()


def check_reverse(source_dir, patch_path):
    """Dry-run a reverse application of *patch_path* in *source_dir*.

    Invokes ``git apply --reverse --check``. Returns ``(True, None)`` when
    git exits with status 0; otherwise ``(False, message)`` where *message*
    is git's stderr, or its stdout when stderr is empty, stripped.
    """
    argv = [
        "git", "-C", str(source_dir),
        "apply", "--reverse", "--check",
        str(patch_path),
    ]
    status, stdout, stderr = run(argv)
    if status == 0:
        return True, None
    return False, (stderr or stdout).strip()


def diff_trees(a, b):
    """Compare directories *a* and *b* and return the unified diff text.

    The comparison is recursive and treats missing files as empty. Entries
    named ``.git`` or ``__pycache__`` and files matching ``*.pyc`` are left
    out: repository metadata differs between clones, so only the source
    tree itself is compared. An empty string means no difference was
    reported on stdout.
    """
    ignored_patterns = (".git", "*.pyc", "__pycache__")
    argv = ["diff", "-ruN"]
    argv.extend(f"--exclude={pattern}" for pattern in ignored_patterns)
    argv += [str(a), str(b)]
    completed = subprocess.run(argv, check=False, text=True, capture_output=True)
    return completed.stdout


def audit_one(component, patch_path, verbose=False):
    """Audit a single patch and return a list of error strings (empty = OK).

    Steps, all performed in a throw-away temporary directory:
      1. Resolve the component's upstream git URL and rev from its recipe.
      2. Clone it and apply the patch; a failure here ends the audit.
      3. Check that ``git apply --reverse --check`` accepts the result.
      4. Apply the patch a second time. This is expected to be rejected;
         a second apply that succeeds is reported as non-idempotent.
      5. Clone again, apply once, and compare that tree with the first one.

    With *verbose* set, progress lines are printed to stdout.
    """
    label = f"{component}/{patch_path.name}"

    upstream = resolve_upstream(component)
    if isinstance(upstream, tuple) and len(upstream) == 3 and upstream[0] == "tar":
        return [f"{label}: tar-based source, manual audit required"]
    if not upstream or upstream[0] is None:
        return [f"{label}: no upstream recipe found in local/recipes/ or recipes/"]
    url, rev = upstream[0], upstream[1]
    if url is None or rev is None:
        return [f"{label}: could not resolve upstream "
                f"git URL or rev for component {component!r}"]
    url = str(url)
    rev = str(rev)

    errors: list[str] = []
    with tempfile.TemporaryDirectory(prefix="audit-patch-") as tmp:
        tmp_path = Path(tmp)
        work = tmp_path / "work"
        work2 = tmp_path / "work2"

        if verbose:
            print(f"  cloning {url} @ {rev[:12]}...")
        ok, err = clone_source(url, rev, work)
        if not ok:
            # clone_source already names the failing step (clone/checkout),
            # so no extra prefix is added here.
            return [f"{label}: {err}"]

        # First application. Success is decided by the ok flag: a failing
        # git invocation may produce an empty message.
        ok, err = apply_patch(work, patch_path)
        if not ok:
            return [f"{label}: apply failed: {err or 'unknown error'}"]

        # The applied patch must be cleanly reversible.
        ok, rev_err = check_reverse(work, patch_path)
        if not ok:
            err_msg = rev_err or "unknown error"
            errors.append(
                f"{label}: --reverse --check FAILED — "
                f"patch is not idempotent. Cookbook's cookbook_apply_patches "
                f"will fail on a re-cook. Underlying error: {err_msg[:500]}"
            )

        # Second application on the already-patched tree. Git is expected
        # to reject it; if it goes through, applying twice changes the tree
        # again and the patch is not idempotent.
        ok, _ = apply_patch(work, patch_path)
        if ok:
            errors.append(
                f"{label}: second apply SUCCEEDED — "
                f"patch is not idempotent. Re-applying after a fresh "
                f"cook will apply it twice. Cookbook should skip via "
                f"--reverse --check; verify the helper still works."
            )
            return errors

        # Reproducibility: a fresh clone plus one apply must give the same
        # tree as the first working copy.
        if verbose:
            print("  re-cloning to verify reproducibility...")
        ok, err = clone_source(url, rev, work2)
        if not ok:
            errors.append(f"{label}: re-clone failed: {err}")
            return errors
        ok, err = apply_patch(work2, patch_path)
        if not ok:
            errors.append(
                f"{label}: reproducibility — second apply failed: "
                f"{err or 'unknown error'}"
            )
            return errors
        diff_out = diff_trees(work, work2)
        if diff_out:
            errors.append(
                f"{label}: non-reproducible — "
                f"second apply produces a different tree:\n"
                f"{diff_out[:1000]}"
            )
    return errors


def main():
    """Parse the command line, audit the selected patches, and report.

    Returns the process exit code:
      0  no errors were found, or no patches matched
      1  at least one patch failed a check
      2  every patch was skipped (--no-fetch), so nothing was audited

    With --json, a single JSON document is written to stdout and all
    human-readable progress output is suppressed.
    """
    # Only needed for --json output; imported here because the file's
    # top-level imports do not include it.
    import json

    parser = argparse.ArgumentParser(
        description=(
            "Validate the idempotency of every external patch in "
            "local/patches/."
        )
    )
    parser.add_argument(
        "--component",
        help="Audit only the given component (default: all)",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true",
        help="Print progress as patches are checked",
    )
    parser.add_argument(
        "--no-fetch", action="store_true",
        help="Skip fetching upstream (useful when network is unavailable)",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Emit a machine-readable JSON summary on stdout "
             "(use for CI hooks or `make lint` integration).",
    )
    args = parser.parse_args()

    human = not args.json
    chatty = args.verbose and human

    patches = list(collect_patches(args.component))
    if not patches:
        if args.json:
            # Same top-level keys as the regular summary, including "total",
            # so consumers can parse both cases the same way.
            print(json.dumps(
                {"patches": [], "total": 0, "errors": 0, "skipped": 0}
            ))
        else:
            scope = f" for component {args.component}" if args.component else ""
            print(f"No patches found{scope}.", file=sys.stderr)
        return 0

    if human:
        print(f"Auditing {len(patches)} patch(es)...")

    all_errors = []
    skipped = 0
    json_results = []
    for component, patch_path in patches:
        entry = {
            "component": component,
            "patch": patch_path.name,
            "status": "ok",
            "errors": [],
        }
        if chatty:
            print(f"[{component}/{patch_path.name}]")
        if args.no_fetch:
            # The audit needs a clone of upstream; without fetching there
            # is nothing to check, so the entry is only recorded.
            entry["status"] = "skipped"
            if human:
                print(f"  {component}/{patch_path.name}: SKIPPED (--no-fetch)")
            skipped += 1
            json_results.append(entry)
            continue
        errors = audit_one(component, patch_path, verbose=chatty)
        if errors:
            entry["status"] = "fail"
            entry["errors"] = list(errors)
            if human:
                for e in errors:
                    print(f"  FAIL: {e}")
            all_errors.extend(errors)
        elif chatty:
            print("  OK")
        json_results.append(entry)

    all_skipped = skipped == len(patches)

    if args.json:
        print(json.dumps({
            "patches": json_results,
            "total": len(patches),
            "errors": len(all_errors),
            "skipped": skipped,
        }, indent=2))
        if all_skipped:
            return 2
        return 0 if not all_errors else 1

    if all_errors:
        print()
        print(f"FAILED: {len(all_errors)} error(s) across {len(patches)} patch(es).")
        print()
        print("Common fixes:")
        print("  1. Patch hunks reference content that no longer exists in")
        print("     the upstream source. Re-generate the patch from a fresh")
        print("     checkout: git diff > local/patches/<component>/NN-...patch")
        print("  2. Patch is order-dependent with a sibling. The cookbook")
        print("     applies them in lexical order — make sure NN-prefix order")
        print("     matches the actual dependency order.")
        print("  3. Patch has whitespace conflicts with the upstream source.")
        print("     Try regenerating with `git diff --ignore-all-space`.")
        return 1
    if all_skipped:
        print()
        print(f"All {len(patches)} patch(es) SKIPPED (--no-fetch). "
              "No audit was performed; the count of 0 errors is not a "
              "pass, just an absence of network-dependent checks.")
        return 2
    print(f"All {len(patches)} patch(es) are idempotent and reproducible.")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())