#!/usr/bin/env python3
"""git_audit.py - Recursively scan a directory tree for Git repositories
(including submodules), collect metadata, and produce a Markdown report
with actionable recommendations.

Handles:
- Nested repos and git submodules (.gitmodules parsing)
- Symlinked submodule paths (graceful skip)
- Missing remotes, empty repos, detached HEADs
- Unstaged, untracked, and staged-but-uncommitted changes

Usage:
    python git_audit.py [ROOT] [--output FILE] [--days N] [--json]

Examples:
    python git_audit.py ~/Projects
    python git_audit.py ~/Projects --output audit.md --days 30
    python git_audit.py ~/Projects --json --output audit.json
"""
from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional
# ── Data Model ──────────────────────────────────────────────────────────────
@dataclass
class RepoInfo:
    """Collected metadata and health indicators for one Git repository."""

    path: str                                    # absolute path to the repo root
    remote_url: Optional[str] = None             # origin URL, if configured
    current_branch: Optional[str] = None         # branch name; "HEAD" when detached
    last_commit_iso: Optional[str] = None        # ISO-8601 committer date of last commit
    last_push_iso: Optional[str] = None          # newest remote-tracking commit date
    untracked_count: int = 0                     # "??" entries in porcelain status
    unstaged_count: int = 0                      # worktree M/D/T entries
    staged_count: int = 0                        # index A/M/D/R/C entries
    is_clean: bool = True                        # no untracked/unstaged/staged changes
    has_remote: bool = False                     # True when remote.origin.url is set
    days_since_last_activity: Optional[int] = None  # age of newest push/commit
    issues: list[str] = field(default_factory=list)  # e.g. "NO_REMOTE", "STALE(45d)"
# ── Git Helpers ─────────────────────────────────────────────────────────────
def run_git(repo: Path, *args: str) -> Optional[str]:
    """Run a git command inside *repo* and return its stripped stdout.

    Returns None on any failure: nonzero exit status, a missing ``git``
    binary, or exceeding the 15-second timeout.  Failures are deliberately
    silent because callers interpret None as "information unavailable"
    (e.g. no remote, no commits, not a repository).
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(repo), *args],
            capture_output=True,
            text=True,
            timeout=15,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return None
    # NOTE: the original code special-cased a few stderr messages but
    # returned None on both paths; any nonzero exit is simply "no answer".
    if result.returncode != 0:
        return None
    return result.stdout.strip()
def is_git_repo(path: Path) -> bool:
    """Return True if *path* contains a ``.git`` directory OR a ``.git`` file.

    A regular ``.git`` file (a "gitlink") is how submodules and linked
    worktrees point at their real git directory, so both forms count.
    """
    dot_git = path / ".git"
    return dot_git.is_dir() or dot_git.is_file()
# ── Repo Discovery ─────────────────────────────────────────────────────────
def parse_gitmodules(repo: Path) -> list[Path]:
    """Extract existing submodule paths declared in *repo*'s .gitmodules.

    Only ``path = <value>`` entries are honored; declared paths that do not
    exist on disk (uninitialized submodules, dangling symlinks) are skipped.
    """
    gitmodules = repo / ".gitmodules"
    if not gitmodules.is_file():
        return []
    paths: list[Path] = []
    for line in gitmodules.read_text(encoding="utf-8", errors="replace").splitlines():
        key, sep, value = line.strip().partition("=")
        # Match the "path" key exactly; a bare startswith("path") would also
        # catch unrelated keys such as "pathspec".
        if sep and key.strip() == "path":
            candidate = repo / value.strip()
            if candidate.exists():
                paths.append(candidate)
    return paths
def find_git_repos(root: Path) -> list[Path]:
    """Walk *root* and return every Git repository found, submodules included."""
    discovered: set[Path] = set()
    for dirpath, subdirs, _files in os.walk(root, followlinks=False):
        here = Path(dirpath)
        # Never wander into .git internals.
        if ".git" in here.parts:
            subdirs.clear()
            continue
        if is_git_repo(here):
            discovered.add(here.resolve())
        # Prune the .git entry itself, but keep descending into the other
        # children — nested repos and submodules may live below this point.
        if ".git" in subdirs:
            subdirs.remove(".git")
    # Second pass: pick up submodules declared in .gitmodules that the
    # walk may have missed (e.g. behind symlinks).
    for repo in tuple(discovered):
        for candidate in parse_gitmodules(repo):
            target = candidate.resolve()
            if is_git_repo(target):
                discovered.add(target)
    return sorted(discovered)
# ── Metadata Collection ─────────────────────────────────────────────────────
def gather_metadata(repo: Path, stale_days: int = 30) -> RepoInfo:
    """Collect detailed metadata for a single repo.

    Args:
        repo: Path to the repository root.
        stale_days: Activity age (in days) beyond which the repo is
            flagged as STALE.

    Returns:
        A populated RepoInfo, including a human-readable ``issues`` list.
    """
    info = RepoInfo(path=str(repo))

    # Remote
    info.remote_url = run_git(repo, "config", "--get", "remote.origin.url")
    info.has_remote = info.remote_url is not None

    # Current branch ("HEAD" when detached)
    info.current_branch = run_git(repo, "rev-parse", "--abbrev-ref", "HEAD")

    # Last commit timestamp (strict ISO-8601 committer date)
    info.last_commit_iso = run_git(repo, "log", "-1", "--format=%cI")

    # Last push approximation: newest commit on any remote-tracking branch
    refs_output = run_git(
        repo,
        "for-each-ref",
        "--sort=-committerdate",
        "--count=1",
        "--format=%(committerdate:iso-strict)",
        "refs/remotes/",
    )
    info.last_push_iso = refs_output if refs_output else None

    # Parse `git status --porcelain`: each line starts with a two-char
    # "XY" code (X = index status, Y = worktree status).
    status_output = run_git(repo, "status", "--porcelain")
    if status_output:
        for line in status_output.splitlines():
            if len(line) < 2:
                continue
            index_status = line[0]
            worktree_status = line[1]
            if worktree_status in ("M", "D", "T"):
                info.unstaged_count += 1
            if index_status in ("A", "M", "D", "R", "C"):
                info.staged_count += 1
            if index_status == "?" and worktree_status == "?":
                info.untracked_count += 1
    info.is_clean = (
        info.untracked_count == 0
        and info.unstaged_count == 0
        and info.staged_count == 0
    )

    # Days since last activity (prefer push time, fall back to commit time)
    ts_str = info.last_push_iso or info.last_commit_iso
    if ts_str:
        try:
            last_dt = datetime.fromisoformat(ts_str)
            now = datetime.now(timezone.utc)
            if last_dt.tzinfo is None:
                # Treat naive timestamps as UTC so aware subtraction works.
                last_dt = last_dt.replace(tzinfo=timezone.utc)
            delta = now - last_dt
            info.days_since_last_activity = delta.days
        except (ValueError, TypeError):
            pass

    # Build issue list
    if not info.has_remote:
        info.issues.append("NO_REMOTE")
    if info.unstaged_count > 0:
        info.issues.append(f"UNSTAGED({info.unstaged_count})")
    if info.untracked_count > 0:
        info.issues.append(f"UNTRACKED({info.untracked_count})")
    if info.staged_count > 0:
        info.issues.append(f"STAGED_UNCOMMITTED({info.staged_count})")
    if info.days_since_last_activity is not None and info.days_since_last_activity > stale_days:
        info.issues.append(f"STALE({info.days_since_last_activity}d)")
    if info.last_commit_iso is None:
        info.issues.append("EMPTY_REPO")
    return info
# ── Report Generation ───────────────────────────────────────────────────────
def generate_markdown_report(repos: list[RepoInfo], root: str, stale_days: int) -> str:
    """Produce a full Markdown audit report.

    Args:
        repos: Metadata for every repository found under *root*.
        root: The scan root, shown in the report header.
        stale_days: Threshold used when flagging stale repositories.

    Returns:
        The complete report as a single newline-joined string.
    """
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M")

    # Summary counters
    total = len(repos)
    clean = sum(1 for r in repos if r.is_clean)
    dirty = total - clean
    no_remote = sum(1 for r in repos if not r.has_remote)
    stale = sum(
        1 for r in repos
        if r.days_since_last_activity is not None
        and r.days_since_last_activity > stale_days
    )
    empty = sum(1 for r in repos if r.last_commit_iso is None)

    lines: list[str] = []
    w = lines.append
    w("# Git Repository Audit Report")
    w("")
    w(f"**Scan root:** `{root}`")
    w(f"**Generated:** {now_str}")
    w(f"**Stale threshold:** {stale_days} days")
    w("")
    w("## Summary")
    w("")
    w("| Metric | Count |")
    w("|--------|------:|")
    w(f"| Total repositories | {total} |")
    w(f"| Clean | {clean} |")
    w(f"| Dirty (unstaged/untracked/staged) | {dirty} |")
    w(f"| No remote configured | {no_remote} |")
    w(f"| Stale (>{stale_days} days) | {stale} |")
    w(f"| Empty (no commits) | {empty} |")
    w("")

    # Detailed table
    w("## Repository Details")
    w("")
    w("| Path | Branch | Last Activity | Remote | Status | Issues |")
    w("|------|--------|--------------|--------|--------|--------|")
    for r in repos:
        path = r.path
        branch = r.current_branch or "—"
        activity = r.last_push_iso or r.last_commit_iso or "never"
        if len(activity) > 25:
            activity = activity[:25]
        remote = "✓" if r.has_remote else "✗"
        status = "CLEAN" if r.is_clean else "DIRTY"
        issues = ", ".join(r.issues) if r.issues else "—"
        w(f"| `{path}` | {branch} | {activity} | {remote} | {status} | {issues} |")
    w("")

    # Opportunities to improve
    dirty_repos = [r for r in repos if not r.is_clean]
    no_remote_repos = [r for r in repos if not r.has_remote]
    stale_repos = [r for r in repos if r.days_since_last_activity is not None
                   and r.days_since_last_activity > stale_days]
    w("## Opportunities to Improve")
    w("")
    if dirty_repos:
        w("### Repositories with Uncommitted Changes")
        w("")
        for r in dirty_repos:
            parts = []
            if r.unstaged_count:
                parts.append(f"{r.unstaged_count} unstaged")
            if r.untracked_count:
                parts.append(f"{r.untracked_count} untracked")
            if r.staged_count:
                parts.append(f"{r.staged_count} staged but uncommitted")
            detail = ", ".join(parts)
            w(f"- **`{r.path}`** — {detail}")
        w("")
    if no_remote_repos:
        w("### Repositories with No Remote")
        w("")
        for r in no_remote_repos:
            w(f"- **`{r.path}`** — consider adding a remote or archiving")
        w("")
    if stale_repos:
        w(f"### Stale Repositories (>{stale_days} days since last activity)")
        w("")
        for r in stale_repos:
            d = r.days_since_last_activity
            w(f"- **`{r.path}`** — {d} days since last activity")
        w("")

    # Action plan
    w("## Recommended Actions")
    w("")
    w("1. **Commit or stash dirty repos** — review each flagged repo and either "
      "`git add . && git commit` or `git stash`.")
    w("2. **Add remotes to orphan repos** — run `git remote add origin <url>` or "
      "archive/delete if no longer needed.")
    w("3. **Sync stale repos** — `git pull` and `git push` to bring them current, "
      "or archive if abandoned.")
    w("4. **Automate this scan** — add to cron/launchd for periodic hygiene checks.")
    w("")
    w("---")
    w("*Generated by git_audit.py*")
    return "\n".join(lines)
# ── Entry Point ─────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: parse arguments, scan, and emit the report.

    Exits with status 1 when the root is not a directory; status 0
    (after a notice on stderr) when no repositories are found.
    """
    parser = argparse.ArgumentParser(
        description="Scan for Git repos and generate an audit report.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "root",
        nargs="?",
        default=".",
        help="Root directory to scan (default: current directory)",
    )
    parser.add_argument(
        "-o", "--output",
        default=None,
        help="Output file path (default: stdout)",
    )
    parser.add_argument(
        "--days",
        type=int,
        default=30,
        help="Threshold in days to flag stale repos (default: 30)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output raw JSON instead of Markdown",
    )
    args = parser.parse_args()

    root = Path(args.root).expanduser().resolve()
    if not root.is_dir():
        print(f"Error: {root} is not a directory.", file=sys.stderr)
        sys.exit(1)

    # Progress chatter goes to stderr so stdout stays clean for the report.
    print(f"Scanning {root} ...", file=sys.stderr)
    repo_paths = find_git_repos(root)
    print(f"Found {len(repo_paths)} repositories.", file=sys.stderr)
    if not repo_paths:
        print("No Git repositories found.", file=sys.stderr)
        sys.exit(0)

    print("Collecting metadata ...", file=sys.stderr)
    repo_infos = [gather_metadata(r, stale_days=args.days) for r in repo_paths]

    # Generate output
    if args.json:
        output = json.dumps([asdict(r) for r in repo_infos], indent=2)
    else:
        output = generate_markdown_report(repo_infos, str(root), args.days)

    # Write
    if args.output:
        out_path = Path(args.output).expanduser()
        out_path.write_text(output, encoding="utf-8")
        print(f"Report written to {out_path}", file=sys.stderr)
    else:
        print(output)
# BUG FIX: the original checked `name == "main"`, which raises NameError
# (or never matches) — the script could not run. The correct guard uses
# the module dunder attributes.
if __name__ == "__main__":
    main()