Files
memex/scripts/wiki-staging.py
Eric Turner ee54a2f5d4 Initial commit — memex
A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
2026-04-12 21:16:02 -06:00

640 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
"""Human-in-the-loop staging pipeline for wiki content.

Pure file operations — no LLM calls. Moves pages between staging/ and the live
wiki, updates indexes, rewrites cross-references, and tracks rejections in
.harvest-state.json.

Usage:
    python3 scripts/wiki-staging.py --list                        # List pending items
    python3 scripts/wiki-staging.py --list --json                 # JSON output
    python3 scripts/wiki-staging.py --stats                       # Summary by type and age
    python3 scripts/wiki-staging.py --promote PATH                # Approve one page
    python3 scripts/wiki-staging.py --reject PATH --reason "..."  # Reject with reason
    python3 scripts/wiki-staging.py --promote-all                 # Approve everything
    python3 scripts/wiki-staging.py --review                      # Interactive approval loop
    python3 scripts/wiki-staging.py --sync                        # Rebuild staging/index.md

PATH may be relative to the wiki root (e.g. `staging/patterns/foo.md`) or absolute.
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from datetime import date
from pathlib import Path
from typing import Any

# Import shared helpers — wiki_lib lives next to this script, so put the
# script directory on sys.path before importing it.
sys.path.insert(0, str(Path(__file__).parent))
from wiki_lib import (  # noqa: E402
    ARCHIVE_DIR,
    CONVERSATIONS_DIR,
    HARVEST_STATE_FILE,
    INDEX_FILE,
    LIVE_CONTENT_DIRS,
    REPORTS_DIR,
    STAGING_DIR,
    STAGING_INDEX,
    WIKI_DIR,
    WikiPage,
    iter_live_pages,
    iter_staging_pages,
    parse_date,
    parse_page,
    today,
    write_page,
)

# Line-buffer both streams so progress output appears promptly when piped.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)

# Fields stripped from frontmatter on promotion (staging-only metadata)
STAGING_ONLY_FIELDS = [
    "status",
    "staged_date",
    "staged_by",
    "target_path",
    "modifies",
    "compilation_notes",
]
# ---------------------------------------------------------------------------
# Discovery
# ---------------------------------------------------------------------------
def list_pending() -> list[WikiPage]:
    """Return every staged page awaiting review, excluding the staging index itself."""
    return [page for page in iter_staging_pages() if page.path.name != "index.md"]
def page_summary(page: WikiPage) -> dict[str, Any]:
    """Flatten a staged page's frontmatter into a plain dict for display/JSON.

    Missing fields get sensible defaults; target path and type are inferred
    from the file's location when the frontmatter does not specify them.
    """
    meta = page.frontmatter
    staged_on = parse_date(meta.get("staged_date"))
    return {
        "path": str(page.path.relative_to(WIKI_DIR)),
        "title": meta.get("title", page.path.stem),
        "type": meta.get("type", _infer_type(page)),
        "status": meta.get("status", "pending"),
        "origin": meta.get("origin", "automated"),
        "staged_by": meta.get("staged_by", "unknown"),
        "staged_date": str(staged_on) if staged_on else None,
        "age_days": (today() - staged_on).days if staged_on else None,
        "target_path": meta.get("target_path") or _infer_target_path(page),
        "modifies": meta.get("modifies"),
        "compilation_notes": meta.get("compilation_notes", ""),
    }
def _infer_target_path(page: WikiPage) -> str:
    """Derive a target path when target_path isn't set in frontmatter.

    A page under staging/ maps onto the same relative path in the live wiki;
    anything else falls back to its wiki-root-relative path unchanged.
    """
    try:
        return str(page.path.relative_to(STAGING_DIR))
    except ValueError:
        # Not under staging/ — keep the page where it already is.
        return str(page.path.relative_to(WIKI_DIR))
def _infer_type(page: WikiPage) -> str:
    """Infer type from the directory name when frontmatter doesn't specify it."""
    parts = page.path.relative_to(STAGING_DIR).parts
    if len(parts) < 2:
        return "unknown"
    top_dir = parts[0]
    if top_dir not in LIVE_CONTENT_DIRS:
        return "unknown"
    return top_dir.rstrip("s")  # 'patterns' → 'pattern'
# ---------------------------------------------------------------------------
# Main index update
# ---------------------------------------------------------------------------
def _remove_from_main_index(rel_path: str) -> None:
    """Remove any main-index entry whose link target is ``rel_path``.

    Entries look like ``- [Title](rel_path)`` optionally followed by summary
    text. Bug fix: the old pattern demanded a literal space after the closing
    paren, so entries with nothing after the link were never removed — the
    pattern now also accepts end-of-line (``\\s`` covers ``\\n``/``\\r\\n``).
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    lines = text.splitlines(keepends=True)
    pattern = re.compile(rf"^- \[.+\]\({re.escape(rel_path)}\)(\s|$)")
    new_lines = [line for line in lines if not pattern.match(line)]
    # Only rewrite the file when something actually matched.
    if len(new_lines) != len(lines):
        INDEX_FILE.write_text("".join(new_lines))
def _add_to_main_index(rel_path: str, title: str, summary: str = "") -> None:
    """Append a new entry under the appropriate section of the main index.

    Best-effort — operator may re-order later. No-ops when the index file is
    missing or already links to ``rel_path``.

    Bug fixes: the summary used to be glued directly onto the link with no
    separator, and the "append" actually inserted the entry *before* the
    section's last line; the entry now lands after the last existing item.
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    # Avoid duplicates — any existing link to this path means we're done.
    if f"]({rel_path})" in text:
        return
    entry = f"- [{title}]({rel_path})"
    if summary:
        entry += f" — {summary}"
    # Pick the section from the top-level directory of the target path.
    ptype = rel_path.split("/")[0]
    section_headers = {
        "patterns": "## Patterns",
        "decisions": "## Decisions",
        "concepts": "## Concepts",
        "environments": "## Environments",
    }
    header = section_headers.get(ptype)
    if header and header in text:
        # Insert at the end of the section's content (before any trailing
        # blank lines that separate it from the next ## header or EOF).
        idx = text.find(header)
        next_header = text.find("\n## ", idx + len(header))
        if next_header == -1:
            next_header = len(text)
        section_end = idx + len(text[idx:next_header].rstrip("\n"))
        INDEX_FILE.write_text(text[:section_end] + "\n" + entry + text[section_end:])
    else:
        # Unknown section — append at the end of the file.
        INDEX_FILE.write_text(text.rstrip() + "\n" + entry + "\n")
# ---------------------------------------------------------------------------
# Staging index update
# ---------------------------------------------------------------------------
def regenerate_staging_index() -> None:
    """Rewrite staging/index.md from the pages currently sitting in staging/."""
    STAGING_DIR.mkdir(parents=True, exist_ok=True)
    pending = list_pending()
    header = [
        "# Staging — Pending Wiki Content",
        "",
        "Content awaiting human review. These pages were generated by automated scripts",
        "and need approval before joining the live wiki.",
        "",
        "**Review options**:",
        "- Browse in Obsidian and move files manually (then run `scripts/wiki-staging.py --sync`)",
        "- Run `python3 scripts/wiki-staging.py --list` for a summary",
        "- Start a Claude session: \"let's review what's in staging\"",
        "",
        f"**{len(pending)} pending item(s)** as of {today().isoformat()}",
        "",
        "## Pending Items",
        "",
    ]
    if not pending:
        body = ["_No pending items._"]
    else:
        # One markdown table row per pending page.
        body = [
            "| Page | Type | Source | Staged | Age | Target |",
            "|------|------|--------|--------|-----|--------|",
        ]
        for page in pending:
            info = page_summary(page)
            link_target = str(page.path.relative_to(STAGING_DIR))
            age_cell = "" if info["age_days"] is None else f"{info['age_days']}d"
            staged_cell = info["staged_date"] or ""
            body.append(
                f"| [{info['title']}]({link_target}) | {info['type']} | "
                f"{info['staged_by']} | {staged_cell} | {age_cell} | `{info['target_path']}` |"
            )
    STAGING_INDEX.write_text("\n".join(header + body) + "\n")
# ---------------------------------------------------------------------------
# Cross-reference rewriting
# ---------------------------------------------------------------------------
def _rewrite_cross_references(old_path: str, new_path: str) -> int:
    """Rewrite links and `related:` entries across the wiki.

    Scans the main index plus every content page in the live, staging, and
    archive trees. Returns the number of files that were modified.
    """
    candidates: list[Path] = [INDEX_FILE]
    roots = [WIKI_DIR]
    if STAGING_DIR.exists():
        roots.append(STAGING_DIR)
    if ARCHIVE_DIR.exists():
        roots.append(ARCHIVE_DIR)
    for root in roots:
        for sub in LIVE_CONTENT_DIRS:
            candidates.extend((root / sub).glob("*.md"))

    old_esc = re.escape(old_path)
    replacements = [
        # Wiki-root-relative markdown links: ](old) → ](new)
        (re.compile(rf"\]\({old_esc}\)"), f"]({new_path})"),
        # Links written from one directory down: ](../old) → ](../new)
        (re.compile(rf"\]\(\.\./{old_esc}\)"), f"](../{new_path})"),
        # Bare `related:`-style list entries naming the old path
        (re.compile(rf"^(\s*-\s*){old_esc}$", re.MULTILINE), rf"\g<1>{new_path}"),
    ]

    changed = 0
    for candidate in candidates:
        if not candidate.exists():
            continue
        try:
            original = candidate.read_text()
        except OSError:
            # Unreadable file — best-effort pass, skip it.
            continue
        updated = original
        for pattern, replacement in replacements:
            updated = pattern.sub(replacement, updated)
        if updated != original:
            candidate.write_text(updated)
            changed += 1
    return changed
# ---------------------------------------------------------------------------
# Promote
# ---------------------------------------------------------------------------
def promote(page: WikiPage, dry_run: bool = False) -> Path | None:
    """Move a staged page into the live wiki.

    Strips staging-only frontmatter fields, writes the page at its target
    path, deletes the staged copy, rewrites cross-references that pointed at
    the staging location, updates the main index, regenerates the staging
    index, and appends a line to today's staging log.

    Returns the destination path (also in dry-run mode, where nothing is
    written).

    Bug fix: the dry-run message and the log line used to jam the source and
    target paths together with no separator.
    """
    summary = page_summary(page)
    target_rel = summary["target_path"]
    target_path = WIKI_DIR / target_rel
    modifies = summary["modifies"]
    if modifies:
        # This is an update to an existing page. Merge: keep staging content,
        # preserve the live page's origin if it was manual.
        live_path = WIKI_DIR / modifies
        if not live_path.exists():
            print(
                f" [warn] modifies target {modifies} does not exist — treating as new page",
                file=sys.stderr,
            )
            modifies = None
        else:
            live_page = parse_page(live_path)
            if live_page:
                # Warn if live page has been updated since staging
                live_compiled = parse_date(live_page.frontmatter.get("last_compiled"))
                staged = parse_date(page.frontmatter.get("staged_date"))
                if live_compiled and staged and live_compiled > staged:
                    print(
                        f" [warn] live page {modifies} was updated ({live_compiled}) "
                        f"after staging ({staged}) — human should verify merge",
                        file=sys.stderr,
                    )
                # Preserve origin from live if it was manual
                if live_page.frontmatter.get("origin") == "manual":
                    page.frontmatter["origin"] = "manual"
    rel_src = str(page.path.relative_to(WIKI_DIR))
    if dry_run:
        action = "update" if modifies else "new page"
        print(f" [dry-run] promote {rel_src} → {target_rel} ({action})")
        return target_path
    # Clean frontmatter — strip staging-only fields
    new_fm = {k: v for k, v in page.frontmatter.items() if k not in STAGING_ONLY_FIELDS}
    new_fm.setdefault("origin", "automated")
    new_fm["last_verified"] = today().isoformat()
    if "last_compiled" not in new_fm:
        new_fm["last_compiled"] = today().isoformat()
    target_path.parent.mkdir(parents=True, exist_ok=True)
    old_path = page.path
    page.path = target_path
    page.frontmatter = new_fm
    write_page(page)
    old_path.unlink()
    # Rewrite cross-references: staging/... → target_rel
    rel_staging = str(old_path.relative_to(WIKI_DIR))
    _rewrite_cross_references(rel_staging, target_rel)
    # Update main index, using the first body line (truncated) as the summary
    summary_text = page.body.strip().splitlines()[0] if page.body.strip() else ""
    _add_to_main_index(target_rel, new_fm.get("title", page.path.stem), summary_text[:120])
    # Regenerate staging index
    regenerate_staging_index()
    # Log to hygiene report (append a line)
    _append_log(f"promote | {rel_staging} → {target_rel}" + (f" (modifies {modifies})" if modifies else ""))
    return target_path
# ---------------------------------------------------------------------------
# Reject
# ---------------------------------------------------------------------------
def reject(page: WikiPage, reason: str, dry_run: bool = False) -> None:
    """Delete a staged page, recording the rejection reason.

    If the page originated from URL harvesting, its source URL is recorded in
    .harvest-state.json so it will not be re-harvested.

    Bug fix: the dry-run message and the log line used to jam the path and
    reason together with no separator.
    """
    rel = str(page.path.relative_to(WIKI_DIR))
    if dry_run:
        print(f" [dry-run] reject {rel} — {reason}")
        return
    # Record in harvest-state if this came from URL harvesting
    _record_rejection_in_harvest_state(page, reason)
    # Delete the file
    page.path.unlink()
    # Regenerate staging index
    regenerate_staging_index()
    _append_log(f"reject | {rel} — {reason}")
    print(f" [rejected] {rel}")
def _record_rejection_in_harvest_state(page: WikiPage, reason: str) -> None:
    """If the staged page came from wiki-harvest, add the source URL to rejected_urls."""
    if not HARVEST_STATE_FILE.exists():
        return

    # Prefer an explicit harvest_source; otherwise chase the first
    # raw/harvested/... entry in `sources` back to its recorded source_url.
    source_url = page.frontmatter.get("harvest_source")
    if not source_url:
        sources = page.frontmatter.get("sources") or []
        if isinstance(sources, list):
            for src in sources:
                src_str = str(src)
                if "raw/harvested/" not in src_str:
                    continue
                raw_path = WIKI_DIR / src_str
                if not raw_path.exists():
                    continue
                raw_page = parse_page(raw_path)
                if raw_page:
                    source_url = raw_page.frontmatter.get("source_url")
                    break
    if not source_url:
        return

    try:
        with open(HARVEST_STATE_FILE) as handle:
            state = json.load(handle)
    except (OSError, json.JSONDecodeError):
        # Unreadable or corrupt state file — leave it alone.
        return

    state.setdefault("rejected_urls", {})[source_url] = {
        "reason": reason,
        "rejected_date": today().isoformat(),
    }
    # A rejected URL should no longer count as harvested.
    state.get("harvested_urls", {}).pop(source_url, None)
    with open(HARVEST_STATE_FILE, "w") as handle:
        json.dump(state, handle, indent=2, sort_keys=True)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
def _append_log(line: str) -> None:
    """Append one line to today's staging log in the reports directory."""
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    log_path = REPORTS_DIR / f"staging-{today().isoformat()}.log"
    with open(log_path, "a") as handle:
        handle.write(line + "\n")
# ---------------------------------------------------------------------------
# Path resolution
# ---------------------------------------------------------------------------
def resolve_page(raw_path: str) -> WikiPage | None:
    """Resolve a CLI path argument to a parsed page, or None with an error message.

    Relative paths resolve against the wiki root; a bare "patterns/foo.md"-style
    path (a live-content directory without the "staging/" prefix) is assumed to
    live under staging/.
    """
    path = Path(raw_path)
    if not path.is_absolute():
        top_dir = raw_path.split("/", 1)[0]
        if top_dir in LIVE_CONTENT_DIRS and not raw_path.startswith("staging/"):
            path = STAGING_DIR / raw_path
        else:
            path = WIKI_DIR / raw_path
    if path.exists():
        return parse_page(path)
    print(f" [error] not found: {path}", file=sys.stderr)
    return None
# ---------------------------------------------------------------------------
# Commands
# ---------------------------------------------------------------------------
def cmd_list(as_json: bool = False) -> int:
    """Print the pending staging items, human-readable or as JSON."""
    pending = list_pending()
    if as_json:
        print(json.dumps([page_summary(page) for page in pending], indent=2))
        return 0
    if not pending:
        print("No pending items in staging.")
        return 0
    print(f"{len(pending)} pending item(s):\n")
    for page in pending:
        info = page_summary(page)
        age = "" if info["age_days"] is None else f"{info['age_days']}d"
        marker = " (update)" if info["modifies"] else ""
        print(f" {info['path']}{marker}")
        print(f" title: {info['title']}")
        print(f" type: {info['type']}")
        print(f" source: {info['staged_by']}")
        print(f" staged: {info['staged_date']} ({age} old)")
        print(f" target: {info['target_path']}")
        if info["modifies"]:
            print(f" modifies: {info['modifies']}")
        if info["compilation_notes"]:
            # Truncate long notes for the terminal view.
            print(f" notes: {info['compilation_notes'][:100]}")
        print()
    return 0
def cmd_stats() -> int:
    """Print aggregate counts of pending items by type, source, and age."""
    pending = list_pending()
    if not pending:
        print("No pending items in staging.")
        return 0

    by_type: dict[str, int] = {}
    by_source: dict[str, int] = {}
    ages: list[int] = []
    updates = 0
    for page in pending:
        info = page_summary(page)
        by_type[info["type"]] = by_type.get(info["type"], 0) + 1
        by_source[info["staged_by"]] = by_source.get(info["staged_by"], 0) + 1
        if info["age_days"] is not None:
            ages.append(info["age_days"])
        if info["modifies"]:
            updates += 1

    total = len(pending)
    print(f"Total pending: {total}")
    print(f"Updates (modifies existing): {updates}")
    print(f"New pages: {total - updates}")
    print()
    print("By type:")
    for name, count in sorted(by_type.items()):
        print(f" {name}: {count}")
    print()
    print("By source:")
    for name, count in sorted(by_source.items()):
        print(f" {name}: {count}")
    if ages:
        print()
        # Integer (floor) average to keep the output tidy.
        print(f"Age (days): min={min(ages)}, max={max(ages)}, avg={sum(ages)//len(ages)}")
    return 0
def cmd_promote(path_arg: str, dry_run: bool) -> int:
    """Promote a single page named on the command line. Returns a process exit code."""
    page = resolve_page(path_arg)
    if not page:
        return 1
    destination = promote(page, dry_run=dry_run)
    if destination and not dry_run:
        print(f" [promoted] {destination.relative_to(WIKI_DIR)}")
    return 0
def cmd_reject(path_arg: str, reason: str, dry_run: bool) -> int:
    """Reject a single page named on the command line. Returns a process exit code."""
    page = resolve_page(path_arg)
    if not page:
        return 1
    reject(page, reason, dry_run=dry_run)
    return 0
def cmd_promote_all(dry_run: bool) -> int:
    """Promote every pending staging page in one pass."""
    pending = list_pending()
    if not pending:
        print("No pending items.")
        return 0
    print(f"Promoting {len(pending)} page(s)...")
    for page in pending:
        promote(page, dry_run=dry_run)
    return 0
def cmd_review() -> int:
    """Interactive review loop. Prompts approve/reject/skip for each pending item."""
    pending = list_pending()
    if not pending:
        print("No pending items.")
        return 0
    print(f"Reviewing {len(pending)} pending item(s). (a)pprove / (r)eject / (s)kip / (q)uit\n")
    for page in pending:
        info = page_summary(page)
        print(f"━━━ {info['path']} ━━━")
        print(f" {info['title']} ({info['type']})")
        print(f" from: {info['staged_by']} ({info['staged_date']})")
        print(f" target: {info['target_path']}")
        if info["modifies"]:
            print(f" updates: {info['modifies']}")
        if info["compilation_notes"]:
            print(f" notes: {info['compilation_notes'][:150]}")
        # Preview: the first three non-blank body lines, truncated.
        preview = [ln for ln in page.body.strip().splitlines() if ln.strip()][:3]
        for ln in preview:
            print(f"{ln[:100]}")
        print()
        # Re-prompt until we get a recognized answer; EOF (Ctrl-D) quits.
        while True:
            try:
                answer = input(" [a/r/s/q] > ").strip().lower()
            except EOFError:
                return 0
            if answer in ("a", "approve"):
                promote(page)
                break
            if answer in ("r", "reject"):
                try:
                    why = input(" reason > ").strip()
                except EOFError:
                    return 0
                reject(page, why or "no reason given")
                break
            if answer in ("s", "skip"):
                break
            if answer in ("q", "quit"):
                return 0
        print()
    return 0
def cmd_sync() -> int:
    """Reconcile staging index after manual operations (Obsidian moves, deletions).

    Also detects pages that were manually moved out of staging without going
    through the promotion flow and reports them.
    """
    print("Regenerating staging index...")
    regenerate_staging_index()
    # A live page still carrying `status: pending` was promoted by hand
    # without its staging frontmatter being cleaned up.
    leaked = [
        page.path
        for page in iter_live_pages()
        if str(page.frontmatter.get("status", "")) == "pending"
    ]
    if leaked:
        print("\n[warn] live pages still marked status: pending — fix manually:")
        for leaked_path in leaked:
            print(f" {leaked_path.relative_to(WIKI_DIR)}")
    print(f"\n{len(list_pending())} pending item(s) in staging.")
    return 0
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: parse arguments and dispatch to the chosen command."""
    parser = argparse.ArgumentParser(description="Wiki staging pipeline")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--list", action="store_true", help="List pending items")
    group.add_argument("--stats", action="store_true", help="Summary stats")
    group.add_argument("--promote", metavar="PATH", help="Approve a pending page")
    group.add_argument("--reject", metavar="PATH", help="Reject a pending page")
    group.add_argument("--promote-all", action="store_true", help="Promote every pending page")
    group.add_argument("--review", action="store_true", help="Interactive approval loop")
    group.add_argument("--sync", action="store_true", help="Regenerate staging index & detect drift")
    parser.add_argument("--json", action="store_true", help="JSON output for --list")
    parser.add_argument("--reason", default="", help="Rejection reason for --reject")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    args = parser.parse_args()

    # Every command assumes staging/ exists.
    STAGING_DIR.mkdir(parents=True, exist_ok=True)

    if args.list:
        return cmd_list(as_json=args.json)
    if args.stats:
        return cmd_stats()
    if args.promote:
        return cmd_promote(args.promote, args.dry_run)
    if args.reject:
        if not args.reason:
            print("--reject requires --reason", file=sys.stderr)
            return 2
        return cmd_reject(args.reject, args.reason, args.dry_run)
    if args.promote_all:
        return cmd_promote_all(args.dry_run)
    if args.review:
        return cmd_review()
    if args.sync:
        return cmd_sync()
    return 0


if __name__ == "__main__":
    sys.exit(main())