Initial commit — memex
A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes: - 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library) - Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE - Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup - 171-test pytest suite (cross-platform, runs in ~1.3s) - MIT licensed
This commit is contained in:
639
scripts/wiki-staging.py
Executable file
639
scripts/wiki-staging.py
Executable file
@@ -0,0 +1,639 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Human-in-the-loop staging pipeline for wiki content.
|
||||
|
||||
Pure file operations — no LLM calls. Moves pages between staging/ and the live
|
||||
wiki, updates indexes, rewrites cross-references, and tracks rejections in
|
||||
.harvest-state.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/wiki-staging.py --list # List pending items
|
||||
python3 scripts/wiki-staging.py --list --json # JSON output
|
||||
python3 scripts/wiki-staging.py --stats # Summary by type and age
|
||||
python3 scripts/wiki-staging.py --promote PATH # Approve one page
|
||||
python3 scripts/wiki-staging.py --reject PATH --reason "..." # Reject with reason
|
||||
python3 scripts/wiki-staging.py --promote-all # Approve everything
|
||||
python3 scripts/wiki-staging.py --review # Interactive approval loop
|
||||
python3 scripts/wiki-staging.py --sync # Rebuild staging/index.md
|
||||
|
||||
PATH may be relative to the wiki root (e.g. `staging/patterns/foo.md`) or absolute.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Import shared helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from wiki_lib import ( # noqa: E402
|
||||
ARCHIVE_DIR,
|
||||
CONVERSATIONS_DIR,
|
||||
HARVEST_STATE_FILE,
|
||||
INDEX_FILE,
|
||||
LIVE_CONTENT_DIRS,
|
||||
REPORTS_DIR,
|
||||
STAGING_DIR,
|
||||
STAGING_INDEX,
|
||||
WIKI_DIR,
|
||||
WikiPage,
|
||||
iter_live_pages,
|
||||
iter_staging_pages,
|
||||
parse_date,
|
||||
parse_page,
|
||||
today,
|
||||
write_page,
|
||||
)
|
||||
|
||||
# Line-buffer stdout/stderr so progress lines appear immediately even when the
# script's output is piped (e.g. into a log file or another process).
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
# Fields stripped from frontmatter on promotion (staging-only metadata)
|
||||
# Fields stripped from frontmatter on promotion (staging-only metadata)
STAGING_ONLY_FIELDS = [
    "status",  # pending/approved marker — meaningless once the page is live
    "staged_date",  # when the page entered staging
    "staged_by",  # which pipeline script produced it
    "target_path",  # where promotion should place it (consumed by promote())
    "modifies",  # live page this staged page updates, if any
    "compilation_notes",  # reviewer-facing notes from the generating script
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Discovery
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def list_pending() -> list[WikiPage]:
    """Return every staged page awaiting review, excluding the staging index itself."""
    result: list[WikiPage] = []
    for candidate in iter_staging_pages():
        if candidate.path.name != "index.md":
            result.append(candidate)
    return result
|
||||
|
||||
|
||||
def page_summary(page: WikiPage) -> dict[str, Any]:
    """Flatten a staged page's frontmatter into a review-friendly summary dict.

    Missing fields fall back to sensible defaults; target path and type are
    inferred from the file's location when not declared in frontmatter.
    """
    fm = page.frontmatter
    staged_on = parse_date(fm.get("staged_date"))
    age_days = None if staged_on is None else (today() - staged_on).days
    return {
        "path": str(page.path.relative_to(WIKI_DIR)),
        "title": fm.get("title", page.path.stem),
        "type": fm.get("type", _infer_type(page)),
        "status": fm.get("status", "pending"),
        "origin": fm.get("origin", "automated"),
        "staged_by": fm.get("staged_by", "unknown"),
        "staged_date": str(staged_on) if staged_on else None,
        "age_days": age_days,
        "target_path": fm.get("target_path") or _infer_target_path(page),
        "modifies": fm.get("modifies"),
        "compilation_notes": fm.get("compilation_notes", ""),
    }
|
||||
|
||||
|
||||
def _infer_target_path(page: WikiPage) -> str:
    """Derive a target path when target_path isn't set in frontmatter.

    Pages under staging/ keep the same relative layout in the live wiki;
    anything else falls back to its path relative to the wiki root.
    """
    try:
        return str(page.path.relative_to(STAGING_DIR))
    except ValueError:
        # Not under staging/ — use the wiki-root-relative path as-is.
        return str(page.path.relative_to(WIKI_DIR))
|
||||
|
||||
|
||||
def _infer_type(page: WikiPage) -> str:
    """Infer type from the directory name when frontmatter doesn't specify it.

    A page at staging/<dir>/foo.md where <dir> is a live content directory gets
    the singular form of that directory name; anything else is "unknown".
    """
    parts = page.path.relative_to(STAGING_DIR).parts
    if len(parts) >= 2 and parts[0] in LIVE_CONTENT_DIRS:
        name = parts[0]
        # Drop a single trailing 's' ('patterns' → 'pattern'). The previous
        # rstrip("s") stripped *every* trailing 's' character, which would
        # mangle any directory name ending in a double 's'.
        return name[:-1] if name.endswith("s") else name
    return "unknown"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main index update
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _remove_from_main_index(rel_path: str) -> None:
    """Remove the index entry linking to *rel_path* from the main index.

    No-op when the index file is missing or contains no matching entry.
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    lines = text.splitlines(keepends=True)
    # Match "- [Title](rel_path)" followed by whitespace (newline or the
    # " — summary" separator) or end-of-line. The previous pattern required a
    # literal trailing space, so entries written without a summary by
    # _add_to_main_index could never be removed.
    pattern = re.compile(rf"^- \[.+\]\({re.escape(rel_path)}\)(\s|$)")
    new_lines = [line for line in lines if not pattern.match(line)]
    if len(new_lines) != len(lines):
        INDEX_FILE.write_text("".join(new_lines))
|
||||
|
||||
|
||||
def _add_to_main_index(rel_path: str, title: str, summary: str = "") -> None:
    """Append a new entry under the appropriate section. Best-effort — operator may re-order later.

    Args:
        rel_path: wiki-root-relative path of the promoted page; its first
            path component selects the index section.
        title: link text for the index entry.
        summary: optional one-line description appended after an em-dash.

    No-op when the index file doesn't exist or already links to rel_path.
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    # Avoid duplicates
    if f"]({rel_path})" in text:
        return
    entry = f"- [{title}]({rel_path})"
    if summary:
        entry += f" — {summary}"
    entry += "\n"
    # Insert at the end of the first matching section
    ptype = rel_path.split("/")[0]
    section_headers = {
        "patterns": "## Patterns",
        "decisions": "## Decisions",
        "concepts": "## Concepts",
        "environments": "## Environments",
    }
    header = section_headers.get(ptype)
    if header and header in text:
        # Find the header and append before the next ## header or EOF
        idx = text.find(header)
        next_header = text.find("\n## ", idx + len(header))
        if next_header == -1:
            next_header = len(text)
        # Find the last non-empty line in the section
        section = text[idx:next_header]
        # rfind over all but the final char, +1 → offset just *after* the last
        # interior newline, i.e. the start of the section's final line.
        last_nl = section.rfind("\n", 0, len(section) - 1) + 1
        INDEX_FILE.write_text(text[: idx + last_nl] + entry + text[idx + last_nl :])
    else:
        # Unknown type or missing section — append at end of file.
        INDEX_FILE.write_text(text.rstrip() + "\n" + entry)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Staging index update
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def regenerate_staging_index() -> None:
    """Rewrite staging/index.md to reflect the pages currently in staging/."""
    STAGING_DIR.mkdir(parents=True, exist_ok=True)
    items = list_pending()

    out: list[str] = [
        "# Staging — Pending Wiki Content",
        "",
        "Content awaiting human review. These pages were generated by automated scripts",
        "and need approval before joining the live wiki.",
        "",
        "**Review options**:",
        "- Browse in Obsidian and move files manually (then run `scripts/wiki-staging.py --sync`)",
        "- Run `python3 scripts/wiki-staging.py --list` for a summary",
        '- Start a Claude session: "let\'s review what\'s in staging"',
        "",
        f"**{len(items)} pending item(s)** as of {today().isoformat()}",
        "",
        "## Pending Items",
        "",
    ]

    if items:
        out.append("| Page | Type | Source | Staged | Age | Target |")
        out.append("|------|------|--------|--------|-----|--------|")
        for item in items:
            info = page_summary(item)
            # Link relative to staging/ so rows resolve from within the index file.
            link = str(item.path.relative_to(STAGING_DIR))
            age_col = "—" if info["age_days"] is None else f"{info['age_days']}d"
            staged_col = info["staged_date"] or "—"
            out.append(
                f"| [{info['title']}]({link}) | {info['type']} | "
                f"{info['staged_by']} | {staged_col} | {age_col} | `{info['target_path']}` |"
            )
    else:
        out.append("_No pending items._")

    STAGING_INDEX.write_text("\n".join(out) + "\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-reference rewriting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _rewrite_cross_references(old_path: str, new_path: str) -> int:
    """Rewrite links and `related:` entries across the wiki.

    Scans the main index plus content pages in the live wiki, staging/, and
    archive/, repointing markdown links and `related:` list entries from
    *old_path* to *new_path*. Returns the number of files modified.
    """
    candidates: list[Path] = [INDEX_FILE]
    roots = [WIKI_DIR]
    if STAGING_DIR.exists():
        roots.append(STAGING_DIR)
    if ARCHIVE_DIR.exists():
        roots.append(ARCHIVE_DIR)
    for root in roots:
        for sub in LIVE_CONTENT_DIRS:
            candidates.extend((root / sub).glob("*.md"))

    escaped = re.escape(old_path)
    substitutions = [
        # Plain markdown link: ](old_path)
        (re.compile(rf"\]\({escaped}\)"), f"]({new_path})"),
        # Link from a sibling content directory: ](../old_path)
        (re.compile(rf"\]\(\.\./{escaped}\)"), f"](../{new_path})"),
        # A `related:` YAML list item consisting solely of the path
        (re.compile(rf"^(\s*-\s*){escaped}$", re.MULTILINE), rf"\g<1>{new_path}"),
    ]

    changed = 0
    for candidate in candidates:
        if not candidate.exists():
            continue
        try:
            original = candidate.read_text()
        except OSError:
            # Unreadable file — skip rather than abort the whole rewrite.
            continue
        updated = original
        for pattern, replacement in substitutions:
            updated = pattern.sub(replacement, updated)
        if updated != original:
            candidate.write_text(updated)
            changed += 1
    return changed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Promote
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def promote(page: WikiPage, dry_run: bool = False) -> Path | None:
    """Move a staged page into the live wiki.

    Strips staging-only frontmatter, writes the page at its target path,
    deletes the staging copy, rewrites cross-references, updates the main and
    staging indexes, and logs the action. With dry_run=True, only prints what
    would happen and returns the would-be target path.

    Returns the target path the page was (or would be) written to.
    """
    summary = page_summary(page)
    target_rel = summary["target_path"]
    target_path = WIKI_DIR / target_rel

    modifies = summary["modifies"]
    if modifies:
        # This is an update to an existing page. Merge: keep staging content,
        # preserve the live page's origin if it was manual.
        live_path = WIKI_DIR / modifies
        if not live_path.exists():
            print(
                f" [warn] modifies target {modifies} does not exist — treating as new page",
                file=sys.stderr,
            )
            modifies = None
        else:
            live_page = parse_page(live_path)
            if live_page:
                # Warn if live page has been updated since staging
                live_compiled = parse_date(live_page.frontmatter.get("last_compiled"))
                staged = parse_date(page.frontmatter.get("staged_date"))
                if live_compiled and staged and live_compiled > staged:
                    print(
                        f" [warn] live page {modifies} was updated ({live_compiled}) "
                        f"after staging ({staged}) — human should verify merge",
                        file=sys.stderr,
                    )
                # Preserve origin from live if it was manual
                if live_page.frontmatter.get("origin") == "manual":
                    page.frontmatter["origin"] = "manual"

    rel_src = str(page.path.relative_to(WIKI_DIR))

    if dry_run:
        action = "update" if modifies else "new page"
        print(f" [dry-run] promote {rel_src} → {target_rel} ({action})")
        return target_path

    # Clean frontmatter — strip staging-only fields
    new_fm = {k: v for k, v in page.frontmatter.items() if k not in STAGING_ONLY_FIELDS}
    new_fm.setdefault("origin", "automated")
    new_fm["last_verified"] = today().isoformat()
    if "last_compiled" not in new_fm:
        new_fm["last_compiled"] = today().isoformat()

    # Write the live copy first, then remove the staging file, so a crash in
    # between leaves both copies rather than neither.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    old_path = page.path
    page.path = target_path
    page.frontmatter = new_fm
    write_page(page)
    old_path.unlink()

    # Rewrite cross-references: staging/... → target_rel
    rel_staging = str(old_path.relative_to(WIKI_DIR))
    _rewrite_cross_references(rel_staging, target_rel)

    # Update main index; first non-empty body line (capped at 120 chars) is the summary.
    summary_text = page.body.strip().splitlines()[0] if page.body.strip() else ""
    _add_to_main_index(target_rel, new_fm.get("title", page.path.stem), summary_text[:120])

    # Regenerate staging index
    regenerate_staging_index()

    # Log to hygiene report (append a line)
    _append_log(f"promote | {rel_staging} → {target_rel}" + (f" (modifies {modifies})" if modifies else ""))
    return target_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reject
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def reject(page: WikiPage, reason: str, dry_run: bool = False) -> None:
    """Reject a staged page: record the rejection, delete it, refresh the index."""
    rel_path = str(page.path.relative_to(WIKI_DIR))
    if dry_run:
        print(f" [dry-run] reject {rel_path} — {reason}")
        return

    # Remember the source URL (if any) so wiki-harvest won't re-fetch it.
    _record_rejection_in_harvest_state(page, reason)

    page.path.unlink()
    regenerate_staging_index()

    _append_log(f"reject | {rel_path} — {reason}")
    print(f" [rejected] {rel_path}")
|
||||
|
||||
|
||||
def _record_rejection_in_harvest_state(page: WikiPage, reason: str) -> None:
    """If the staged page came from wiki-harvest, add the source URL to rejected_urls.

    Best-effort: silently returns when the state file is missing/unreadable or
    no source URL can be determined.
    """
    if not HARVEST_STATE_FILE.exists():
        return
    # Look for the source URL in frontmatter (harvest_source) or in sources field
    source_url = page.frontmatter.get("harvest_source")
    if not source_url:
        sources = page.frontmatter.get("sources") or []
        if isinstance(sources, list):
            for src in sources:
                src_str = str(src)
                # If src is a raw/harvested/... file, look up its source_url
                if "raw/harvested/" in src_str:
                    raw_path = WIKI_DIR / src_str
                    if raw_path.exists():
                        raw_page = parse_page(raw_path)
                        if raw_page:
                            source_url = raw_page.frontmatter.get("source_url")
                    # NOTE(review): stops at the first harvested source, even
                    # if its raw page is missing or unparseable — confirm this
                    # is the intended precedence.
                    break

    if not source_url:
        return

    try:
        with open(HARVEST_STATE_FILE) as f:
            state = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Corrupt or unreadable state file — skip rather than crash the reject.
        return

    state.setdefault("rejected_urls", {})[source_url] = {
        "reason": reason,
        "rejected_date": today().isoformat(),
    }
    # Remove from harvested_urls if present
    state.get("harvested_urls", {}).pop(source_url, None)

    with open(HARVEST_STATE_FILE, "w") as f:
        json.dump(state, f, indent=2, sort_keys=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _append_log(line: str) -> None:
    """Append one line to today's staging log under REPORTS_DIR."""
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    log_path = REPORTS_DIR / f"staging-{today().isoformat()}.log"
    with open(log_path, "a") as handle:
        handle.write(line + "\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Path resolution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def resolve_page(raw_path: str) -> WikiPage | None:
    """Resolve a CLI path argument to a parsed WikiPage, or None if not found.

    Relative paths are interpreted against the wiki root; a bare
    "patterns/foo.md" (no "staging/" prefix) is assumed to live in staging/.
    """
    path = Path(raw_path)
    if not path.is_absolute():
        top_dir = raw_path.split("/", 1)[0]
        if top_dir in LIVE_CONTENT_DIRS and not raw_path.startswith("staging/"):
            path = STAGING_DIR / raw_path
        else:
            path = WIKI_DIR / raw_path
    if path.exists():
        return parse_page(path)
    print(f" [error] not found: {path}", file=sys.stderr)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Commands
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def cmd_list(as_json: bool = False) -> int:
    """Print pending staging items, human-readable or (with as_json) as JSON."""
    items = list_pending()
    if as_json:
        print(json.dumps([page_summary(item) for item in items], indent=2))
        return 0

    if not items:
        print("No pending items in staging.")
        return 0

    print(f"{len(items)} pending item(s):\n")
    for item in items:
        info = page_summary(item)
        age_text = "—" if info["age_days"] is None else f"{info['age_days']}d"
        suffix = " (update)" if info["modifies"] else ""
        print(f" {info['path']}{suffix}")
        print(f" title: {info['title']}")
        print(f" type: {info['type']}")
        print(f" source: {info['staged_by']}")
        print(f" staged: {info['staged_date']} ({age_text} old)")
        print(f" target: {info['target_path']}")
        if info["modifies"]:
            print(f" modifies: {info['modifies']}")
        if info["compilation_notes"]:
            # Truncate long notes to keep the listing scannable.
            print(f" notes: {info['compilation_notes'][:100]}")
        print()
    return 0
|
||||
|
||||
|
||||
def cmd_stats() -> int:
    """Print aggregate counts for pending items: totals, by type, by source, age."""
    items = list_pending()
    total = len(items)
    if total == 0:
        print("No pending items in staging.")
        return 0

    type_counts: dict[str, int] = {}
    source_counts: dict[str, int] = {}
    age_values: list[int] = []
    update_count = 0

    for item in items:
        info = page_summary(item)
        type_counts[info["type"]] = type_counts.get(info["type"], 0) + 1
        source_counts[info["staged_by"]] = source_counts.get(info["staged_by"], 0) + 1
        if info["age_days"] is not None:
            age_values.append(info["age_days"])
        if info["modifies"]:
            update_count += 1

    print(f"Total pending: {total}")
    print(f"Updates (modifies existing): {update_count}")
    print(f"New pages: {total - update_count}")
    print()
    print("By type:")
    for name, count in sorted(type_counts.items()):
        print(f" {name}: {count}")
    print()
    print("By source:")
    for name, count in sorted(source_counts.items()):
        print(f" {name}: {count}")
    if age_values:
        print()
        print(f"Age (days): min={min(age_values)}, max={max(age_values)}, avg={sum(age_values)//len(age_values)}")
    return 0
|
||||
|
||||
|
||||
def cmd_promote(path_arg: str, dry_run: bool) -> int:
    """Promote one staged page named on the command line; 1 if it can't be found."""
    target = resolve_page(path_arg)
    if target is None:
        return 1
    promoted_to = promote(target, dry_run=dry_run)
    if promoted_to is not None and not dry_run:
        print(f" [promoted] {promoted_to.relative_to(WIKI_DIR)}")
    return 0
|
||||
|
||||
|
||||
def cmd_reject(path_arg: str, reason: str, dry_run: bool) -> int:
    """Reject one staged page named on the command line; 1 if it can't be found."""
    target = resolve_page(path_arg)
    if target is None:
        return 1
    reject(target, reason, dry_run=dry_run)
    return 0
|
||||
|
||||
|
||||
def cmd_promote_all(dry_run: bool) -> int:
    """Promote every pending staging page in one pass."""
    items = list_pending()
    if not items:
        print("No pending items.")
        return 0
    print(f"Promoting {len(items)} page(s)...")
    for item in items:
        promote(item, dry_run=dry_run)
    return 0
|
||||
|
||||
|
||||
def cmd_review() -> int:
    """Interactive review loop. Prompts approve/reject/skip for each pending item.

    Quits (returning 0) on 'q' or EOF at any prompt; unrecognized input
    re-prompts for the same item.
    """
    pending = list_pending()
    if not pending:
        print("No pending items.")
        return 0

    print(f"Reviewing {len(pending)} pending item(s). (a)pprove / (r)eject / (s)kip / (q)uit\n")
    for p in pending:
        s = page_summary(p)
        print(f"━━━ {s['path']} ━━━")
        print(f" {s['title']} ({s['type']})")
        print(f" from: {s['staged_by']} ({s['staged_date']})")
        print(f" target: {s['target_path']}")
        if s["modifies"]:
            print(f" updates: {s['modifies']}")
        if s["compilation_notes"]:
            print(f" notes: {s['compilation_notes'][:150]}")
        # Show first few lines of body
        first_lines = [ln for ln in p.body.strip().splitlines() if ln.strip()][:3]
        for ln in first_lines:
            print(f" │ {ln[:100]}")
        print()

        while True:
            try:
                answer = input(" [a/r/s/q] > ").strip().lower()
            except EOFError:
                # Piped input exhausted — treat like quit.
                return 0
            if answer in ("a", "approve"):
                promote(p)
                break
            if answer in ("r", "reject"):
                try:
                    reason = input(" reason > ").strip()
                except EOFError:
                    return 0
                reject(p, reason or "no reason given")
                break
            if answer in ("s", "skip"):
                break
            if answer in ("q", "quit"):
                return 0
        # Blank line separating reviewed items.
        print()
    return 0
|
||||
|
||||
|
||||
def cmd_sync() -> int:
    """Reconcile staging index after manual operations (Obsidian moves, deletions).

    Also detects pages that were manually moved out of staging without going through
    the promotion flow and reports them.
    """
    print("Regenerating staging index...")
    regenerate_staging_index()

    # Pages promoted by hand often keep their staging frontmatter; flag them.
    stragglers = [
        page.path
        for page in iter_live_pages()
        if str(page.frontmatter.get("status", "")) == "pending"
    ]
    if stragglers:
        print("\n[warn] live pages still marked status: pending — fix manually:")
        for path in stragglers:
            print(f" {path.relative_to(WIKI_DIR)}")

    print(f"\n{len(list_pending())} pending item(s) in staging.")
    return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse arguments and dispatch to the selected command."""
    ap = argparse.ArgumentParser(description="Wiki staging pipeline")
    mode = ap.add_mutually_exclusive_group(required=True)
    mode.add_argument("--list", action="store_true", help="List pending items")
    mode.add_argument("--stats", action="store_true", help="Summary stats")
    mode.add_argument("--promote", metavar="PATH", help="Approve a pending page")
    mode.add_argument("--reject", metavar="PATH", help="Reject a pending page")
    mode.add_argument("--promote-all", action="store_true", help="Promote every pending page")
    mode.add_argument("--review", action="store_true", help="Interactive approval loop")
    mode.add_argument("--sync", action="store_true", help="Regenerate staging index & detect drift")

    ap.add_argument("--json", action="store_true", help="JSON output for --list")
    ap.add_argument("--reason", default="", help="Rejection reason for --reject")
    ap.add_argument("--dry-run", action="store_true", help="Show what would happen")
    args = ap.parse_args()

    STAGING_DIR.mkdir(parents=True, exist_ok=True)

    # Flags are mutually exclusive, so at most one branch fires.
    if args.promote:
        return cmd_promote(args.promote, args.dry_run)
    if args.reject:
        if not args.reason:
            print("--reject requires --reason", file=sys.stderr)
            return 2
        return cmd_reject(args.reject, args.reason, args.dry_run)
    if args.promote_all:
        return cmd_promote_all(args.dry_run)
    if args.list:
        return cmd_list(as_json=args.json)
    if args.stats:
        return cmd_stats()
    if args.review:
        return cmd_review()
    if args.sync:
        return cmd_sync()
    return 0
|
||||
|
||||
|
||||
# Script entry point — exit status propagates the command's return code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user