Files
memex/scripts/wiki-staging.py
Eric Turner ee54a2f5d4 Initial commit — memex
A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
2026-04-12 21:16:02 -06:00

640 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
"""Human-in-the-loop staging pipeline for wiki content.

Pure file operations — no LLM calls. Moves pages between staging/ and the live
wiki, updates indexes, rewrites cross-references, and tracks rejections in
.harvest-state.json.

Usage:
    python3 scripts/wiki-staging.py --list                        # List pending items
    python3 scripts/wiki-staging.py --list --json                 # JSON output
    python3 scripts/wiki-staging.py --stats                       # Summary by type and age
    python3 scripts/wiki-staging.py --promote PATH                # Approve one page
    python3 scripts/wiki-staging.py --reject PATH --reason "..."  # Reject with reason
    python3 scripts/wiki-staging.py --promote-all                 # Approve everything
    python3 scripts/wiki-staging.py --review                      # Interactive approval loop
    python3 scripts/wiki-staging.py --sync                        # Rebuild staging/index.md

PATH may be relative to the wiki root (e.g. `staging/patterns/foo.md`) or absolute.
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from datetime import date
from pathlib import Path
from typing import Any

# Import shared helpers — wiki_lib lives next to this script, so put the
# script directory on sys.path before importing it.
sys.path.insert(0, str(Path(__file__).parent))
from wiki_lib import (  # noqa: E402
    ARCHIVE_DIR,
    CONVERSATIONS_DIR,
    HARVEST_STATE_FILE,
    INDEX_FILE,
    LIVE_CONTENT_DIRS,
    REPORTS_DIR,
    STAGING_DIR,
    STAGING_INDEX,
    WIKI_DIR,
    WikiPage,
    iter_live_pages,
    iter_staging_pages,
    parse_date,
    parse_page,
    today,
    write_page,
)

# Line-buffer both streams so progress output appears promptly when piped.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)

# Fields stripped from frontmatter on promotion (staging-only metadata)
STAGING_ONLY_FIELDS = [
    "status",
    "staged_date",
    "staged_by",
    "target_path",
    "modifies",
    "compilation_notes",
]
# ---------------------------------------------------------------------------
# Discovery
# ---------------------------------------------------------------------------
def list_pending() -> list[WikiPage]:
    """Return every staged page awaiting review, excluding the staging index itself."""
    return [page for page in iter_staging_pages() if page.path.name != "index.md"]
def page_summary(page: WikiPage) -> dict[str, Any]:
    """Flatten a staged page's frontmatter into a plain dict for display/JSON.

    Missing fields get sensible defaults; target path and type are inferred
    from the file's location when the frontmatter does not specify them.
    """
    meta = page.frontmatter
    staged_on = parse_date(meta.get("staged_date"))
    return {
        "path": str(page.path.relative_to(WIKI_DIR)),
        "title": meta.get("title", page.path.stem),
        "type": meta.get("type", _infer_type(page)),
        "status": meta.get("status", "pending"),
        "origin": meta.get("origin", "automated"),
        "staged_by": meta.get("staged_by", "unknown"),
        "staged_date": str(staged_on) if staged_on else None,
        "age_days": (today() - staged_on).days if staged_on else None,
        "target_path": meta.get("target_path") or _infer_target_path(page),
        "modifies": meta.get("modifies"),
        "compilation_notes": meta.get("compilation_notes", ""),
    }
def _infer_target_path(page: WikiPage) -> str:
    """Derive a target path when target_path isn't set in frontmatter.

    A page under staging/ maps onto the same relative path in the live wiki;
    anything else falls back to its wiki-root-relative path unchanged.
    """
    try:
        return str(page.path.relative_to(STAGING_DIR))
    except ValueError:
        # Not under staging/ — keep the page where it already is.
        return str(page.path.relative_to(WIKI_DIR))
def _infer_type(page: WikiPage) -> str:
    """Infer type from the directory name when frontmatter doesn't specify it."""
    parts = page.path.relative_to(STAGING_DIR).parts
    if len(parts) < 2:
        return "unknown"
    top_dir = parts[0]
    if top_dir not in LIVE_CONTENT_DIRS:
        return "unknown"
    return top_dir.rstrip("s")  # 'patterns' → 'pattern'
# ---------------------------------------------------------------------------
# Main index update
# ---------------------------------------------------------------------------
def _remove_from_main_index(rel_path: str) -> None:
    """Remove any main-index entry whose link target is ``rel_path``.

    Entries look like ``- [Title](rel_path)`` optionally followed by summary
    text. Bug fix: the old pattern demanded a literal space after the closing
    paren, so entries with nothing after the link were never removed — the
    pattern now also accepts end-of-line (``\\s`` covers ``\\n``/``\\r\\n``).
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    lines = text.splitlines(keepends=True)
    pattern = re.compile(rf"^- \[.+\]\({re.escape(rel_path)}\)(\s|$)")
    new_lines = [line for line in lines if not pattern.match(line)]
    # Only rewrite the file when something actually matched.
    if len(new_lines) != len(lines):
        INDEX_FILE.write_text("".join(new_lines))
def _add_to_main_index(rel_path: str, title: str, summary: str = "") -> None:
    """Append a new entry under the appropriate section of the main index.

    Best-effort — operator may re-order later. No-ops when the index file is
    missing or already links to ``rel_path``.

    Bug fixes: the summary used to be glued directly onto the link with no
    separator, and the "append" actually inserted the entry *before* the
    section's last line; the entry now lands after the last existing item.
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    # Avoid duplicates — any existing link to this path means we're done.
    if f"]({rel_path})" in text:
        return
    entry = f"- [{title}]({rel_path})"
    if summary:
        entry += f" — {summary}"
    # Pick the section from the top-level directory of the target path.
    ptype = rel_path.split("/")[0]
    section_headers = {
        "patterns": "## Patterns",
        "decisions": "## Decisions",
        "concepts": "## Concepts",
        "environments": "## Environments",
    }
    header = section_headers.get(ptype)
    if header and header in text:
        # Insert at the end of the section's content (before any trailing
        # blank lines that separate it from the next ## header or EOF).
        idx = text.find(header)
        next_header = text.find("\n## ", idx + len(header))
        if next_header == -1:
            next_header = len(text)
        section_end = idx + len(text[idx:next_header].rstrip("\n"))
        INDEX_FILE.write_text(text[:section_end] + "\n" + entry + text[section_end:])
    else:
        # Unknown section — append at the end of the file.
        INDEX_FILE.write_text(text.rstrip() + "\n" + entry + "\n")
# ---------------------------------------------------------------------------
# Staging index update
# ---------------------------------------------------------------------------
def regenerate_staging_index() -> None:
    """Rewrite staging/index.md from the pages currently sitting in staging/."""
    STAGING_DIR.mkdir(parents=True, exist_ok=True)
    pending = list_pending()
    header = [
        "# Staging — Pending Wiki Content",
        "",
        "Content awaiting human review. These pages were generated by automated scripts",
        "and need approval before joining the live wiki.",
        "",
        "**Review options**:",
        "- Browse in Obsidian and move files manually (then run `scripts/wiki-staging.py --sync`)",
        "- Run `python3 scripts/wiki-staging.py --list` for a summary",
        "- Start a Claude session: \"let's review what's in staging\"",
        "",
        f"**{len(pending)} pending item(s)** as of {today().isoformat()}",
        "",
        "## Pending Items",
        "",
    ]
    if not pending:
        body = ["_No pending items._"]
    else:
        # One markdown table row per pending page.
        body = [
            "| Page | Type | Source | Staged | Age | Target |",
            "|------|------|--------|--------|-----|--------|",
        ]
        for page in pending:
            info = page_summary(page)
            link_target = str(page.path.relative_to(STAGING_DIR))
            age_cell = "" if info["age_days"] is None else f"{info['age_days']}d"
            staged_cell = info["staged_date"] or ""
            body.append(
                f"| [{info['title']}]({link_target}) | {info['type']} | "
                f"{info['staged_by']} | {staged_cell} | {age_cell} | `{info['target_path']}` |"
            )
    STAGING_INDEX.write_text("\n".join(header + body) + "\n")
# ---------------------------------------------------------------------------
# Cross-reference rewriting
# ---------------------------------------------------------------------------
def _rewrite_cross_references(old_path: str, new_path: str) -> int:
    """Rewrite links and `related:` entries across the wiki.

    Scans the main index plus every content page in the live, staging, and
    archive trees. Returns the number of files that were modified.
    """
    candidates: list[Path] = [INDEX_FILE]
    roots = [WIKI_DIR]
    if STAGING_DIR.exists():
        roots.append(STAGING_DIR)
    if ARCHIVE_DIR.exists():
        roots.append(ARCHIVE_DIR)
    for root in roots:
        for sub in LIVE_CONTENT_DIRS:
            candidates.extend((root / sub).glob("*.md"))

    old_esc = re.escape(old_path)
    replacements = [
        # Wiki-root-relative markdown links: ](old) → ](new)
        (re.compile(rf"\]\({old_esc}\)"), f"]({new_path})"),
        # Links written from one directory down: ](../old) → ](../new)
        (re.compile(rf"\]\(\.\./{old_esc}\)"), f"](../{new_path})"),
        # Bare `related:`-style list entries naming the old path
        (re.compile(rf"^(\s*-\s*){old_esc}$", re.MULTILINE), rf"\g<1>{new_path}"),
    ]

    changed = 0
    for candidate in candidates:
        if not candidate.exists():
            continue
        try:
            original = candidate.read_text()
        except OSError:
            # Unreadable file — best-effort pass, skip it.
            continue
        updated = original
        for pattern, replacement in replacements:
            updated = pattern.sub(replacement, updated)
        if updated != original:
            candidate.write_text(updated)
            changed += 1
    return changed
# ---------------------------------------------------------------------------
# Promote
# ---------------------------------------------------------------------------
def promote(page: WikiPage, dry_run: bool = False) -> Path | None:
    """Move a staged page into the live wiki.

    Strips staging-only frontmatter fields, writes the page at its target
    path, deletes the staged copy, rewrites cross-references that pointed at
    the staging location, updates the main index, regenerates the staging
    index, and appends a line to today's staging log.

    Returns the destination path (also in dry-run mode, where nothing is
    written).

    Bug fix: the dry-run message and the log line used to jam the source and
    target paths together with no separator.
    """
    summary = page_summary(page)
    target_rel = summary["target_path"]
    target_path = WIKI_DIR / target_rel
    modifies = summary["modifies"]
    if modifies:
        # This is an update to an existing page. Merge: keep staging content,
        # preserve the live page's origin if it was manual.
        live_path = WIKI_DIR / modifies
        if not live_path.exists():
            print(
                f" [warn] modifies target {modifies} does not exist — treating as new page",
                file=sys.stderr,
            )
            modifies = None
        else:
            live_page = parse_page(live_path)
            if live_page:
                # Warn if live page has been updated since staging
                live_compiled = parse_date(live_page.frontmatter.get("last_compiled"))
                staged = parse_date(page.frontmatter.get("staged_date"))
                if live_compiled and staged and live_compiled > staged:
                    print(
                        f" [warn] live page {modifies} was updated ({live_compiled}) "
                        f"after staging ({staged}) — human should verify merge",
                        file=sys.stderr,
                    )
                # Preserve origin from live if it was manual
                if live_page.frontmatter.get("origin") == "manual":
                    page.frontmatter["origin"] = "manual"
    rel_src = str(page.path.relative_to(WIKI_DIR))
    if dry_run:
        action = "update" if modifies else "new page"
        print(f" [dry-run] promote {rel_src} → {target_rel} ({action})")
        return target_path
    # Clean frontmatter — strip staging-only fields
    new_fm = {k: v for k, v in page.frontmatter.items() if k not in STAGING_ONLY_FIELDS}
    new_fm.setdefault("origin", "automated")
    new_fm["last_verified"] = today().isoformat()
    if "last_compiled" not in new_fm:
        new_fm["last_compiled"] = today().isoformat()
    target_path.parent.mkdir(parents=True, exist_ok=True)
    old_path = page.path
    page.path = target_path
    page.frontmatter = new_fm
    write_page(page)
    old_path.unlink()
    # Rewrite cross-references: staging/... → target_rel
    rel_staging = str(old_path.relative_to(WIKI_DIR))
    _rewrite_cross_references(rel_staging, target_rel)
    # Update main index, using the first body line (truncated) as the summary
    summary_text = page.body.strip().splitlines()[0] if page.body.strip() else ""
    _add_to_main_index(target_rel, new_fm.get("title", page.path.stem), summary_text[:120])
    # Regenerate staging index
    regenerate_staging_index()
    # Log to hygiene report (append a line)
    _append_log(f"promote | {rel_staging} → {target_rel}" + (f" (modifies {modifies})" if modifies else ""))
    return target_path
# ---------------------------------------------------------------------------
# Reject
# ---------------------------------------------------------------------------
def reject(page: WikiPage, reason: str, dry_run: bool = False) -> None:
    """Delete a staged page, recording the rejection reason.

    If the page originated from URL harvesting, its source URL is recorded in
    .harvest-state.json so it will not be re-harvested.

    Bug fix: the dry-run message and the log line used to jam the path and
    reason together with no separator.
    """
    rel = str(page.path.relative_to(WIKI_DIR))
    if dry_run:
        print(f" [dry-run] reject {rel} — {reason}")
        return
    # Record in harvest-state if this came from URL harvesting
    _record_rejection_in_harvest_state(page, reason)
    # Delete the file
    page.path.unlink()
    # Regenerate staging index
    regenerate_staging_index()
    _append_log(f"reject | {rel} — {reason}")
    print(f" [rejected] {rel}")
def _record_rejection_in_harvest_state(page: WikiPage, reason: str) -> None:
    """If the staged page came from wiki-harvest, add the source URL to rejected_urls."""
    if not HARVEST_STATE_FILE.exists():
        return

    # Prefer an explicit harvest_source; otherwise chase the first
    # raw/harvested/... entry in `sources` back to its recorded source_url.
    source_url = page.frontmatter.get("harvest_source")
    if not source_url:
        sources = page.frontmatter.get("sources") or []
        if isinstance(sources, list):
            for src in sources:
                src_str = str(src)
                if "raw/harvested/" not in src_str:
                    continue
                raw_path = WIKI_DIR / src_str
                if not raw_path.exists():
                    continue
                raw_page = parse_page(raw_path)
                if raw_page:
                    source_url = raw_page.frontmatter.get("source_url")
                    break
    if not source_url:
        return

    try:
        with open(HARVEST_STATE_FILE) as handle:
            state = json.load(handle)
    except (OSError, json.JSONDecodeError):
        # Unreadable or corrupt state file — leave it alone.
        return

    state.setdefault("rejected_urls", {})[source_url] = {
        "reason": reason,
        "rejected_date": today().isoformat(),
    }
    # A rejected URL should no longer count as harvested.
    state.get("harvested_urls", {}).pop(source_url, None)
    with open(HARVEST_STATE_FILE, "w") as handle:
        json.dump(state, handle, indent=2, sort_keys=True)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
def _append_log(line: str) -> None:
    """Append one line to today's staging log in the reports directory."""
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    log_path = REPORTS_DIR / f"staging-{today().isoformat()}.log"
    with open(log_path, "a") as handle:
        handle.write(line + "\n")
# ---------------------------------------------------------------------------
# Path resolution
# ---------------------------------------------------------------------------
def resolve_page(raw_path: str) -> WikiPage | None:
    """Resolve a CLI path argument to a parsed page, or None with an error message.

    Relative paths resolve against the wiki root; a bare "patterns/foo.md"-style
    path (a live-content directory without the "staging/" prefix) is assumed to
    live under staging/.
    """
    path = Path(raw_path)
    if not path.is_absolute():
        top_dir = raw_path.split("/", 1)[0]
        if top_dir in LIVE_CONTENT_DIRS and not raw_path.startswith("staging/"):
            path = STAGING_DIR / raw_path
        else:
            path = WIKI_DIR / raw_path
    if path.exists():
        return parse_page(path)
    print(f" [error] not found: {path}", file=sys.stderr)
    return None
# ---------------------------------------------------------------------------
# Commands
# ---------------------------------------------------------------------------
def cmd_list(as_json: bool = False) -> int:
    """Print the pending staging items, human-readable or as JSON."""
    pending = list_pending()
    if as_json:
        print(json.dumps([page_summary(page) for page in pending], indent=2))
        return 0
    if not pending:
        print("No pending items in staging.")
        return 0
    print(f"{len(pending)} pending item(s):\n")
    for page in pending:
        info = page_summary(page)
        age = "" if info["age_days"] is None else f"{info['age_days']}d"
        marker = " (update)" if info["modifies"] else ""
        print(f" {info['path']}{marker}")
        print(f" title: {info['title']}")
        print(f" type: {info['type']}")
        print(f" source: {info['staged_by']}")
        print(f" staged: {info['staged_date']} ({age} old)")
        print(f" target: {info['target_path']}")
        if info["modifies"]:
            print(f" modifies: {info['modifies']}")
        if info["compilation_notes"]:
            # Truncate long notes for the terminal view.
            print(f" notes: {info['compilation_notes'][:100]}")
        print()
    return 0
def cmd_stats() -> int:
    """Print aggregate counts of pending items by type, source, and age."""
    pending = list_pending()
    if not pending:
        print("No pending items in staging.")
        return 0

    by_type: dict[str, int] = {}
    by_source: dict[str, int] = {}
    ages: list[int] = []
    updates = 0
    for page in pending:
        info = page_summary(page)
        by_type[info["type"]] = by_type.get(info["type"], 0) + 1
        by_source[info["staged_by"]] = by_source.get(info["staged_by"], 0) + 1
        if info["age_days"] is not None:
            ages.append(info["age_days"])
        if info["modifies"]:
            updates += 1

    total = len(pending)
    print(f"Total pending: {total}")
    print(f"Updates (modifies existing): {updates}")
    print(f"New pages: {total - updates}")
    print()
    print("By type:")
    for name, count in sorted(by_type.items()):
        print(f" {name}: {count}")
    print()
    print("By source:")
    for name, count in sorted(by_source.items()):
        print(f" {name}: {count}")
    if ages:
        print()
        # Integer (floor) average to keep the output tidy.
        print(f"Age (days): min={min(ages)}, max={max(ages)}, avg={sum(ages)//len(ages)}")
    return 0
def cmd_promote(path_arg: str, dry_run: bool) -> int:
    """Promote a single page named on the command line. Returns a process exit code."""
    page = resolve_page(path_arg)
    if not page:
        return 1
    destination = promote(page, dry_run=dry_run)
    if destination and not dry_run:
        print(f" [promoted] {destination.relative_to(WIKI_DIR)}")
    return 0
def cmd_reject(path_arg: str, reason: str, dry_run: bool) -> int:
    """Reject a single page named on the command line. Returns a process exit code."""
    page = resolve_page(path_arg)
    if not page:
        return 1
    reject(page, reason, dry_run=dry_run)
    return 0
def cmd_promote_all(dry_run: bool) -> int:
    """Promote every pending staging page in one pass."""
    pending = list_pending()
    if not pending:
        print("No pending items.")
        return 0
    print(f"Promoting {len(pending)} page(s)...")
    for page in pending:
        promote(page, dry_run=dry_run)
    return 0
def cmd_review() -> int:
    """Interactive review loop. Prompts approve/reject/skip for each pending item."""
    pending = list_pending()
    if not pending:
        print("No pending items.")
        return 0
    print(f"Reviewing {len(pending)} pending item(s). (a)pprove / (r)eject / (s)kip / (q)uit\n")
    for page in pending:
        info = page_summary(page)
        print(f"━━━ {info['path']} ━━━")
        print(f" {info['title']} ({info['type']})")
        print(f" from: {info['staged_by']} ({info['staged_date']})")
        print(f" target: {info['target_path']}")
        if info["modifies"]:
            print(f" updates: {info['modifies']}")
        if info["compilation_notes"]:
            print(f" notes: {info['compilation_notes'][:150]}")
        # Preview: the first three non-blank body lines, truncated.
        preview = [ln for ln in page.body.strip().splitlines() if ln.strip()][:3]
        for ln in preview:
            print(f"{ln[:100]}")
        print()
        # Re-prompt until we get a recognized answer; EOF (Ctrl-D) quits.
        while True:
            try:
                answer = input(" [a/r/s/q] > ").strip().lower()
            except EOFError:
                return 0
            if answer in ("a", "approve"):
                promote(page)
                break
            if answer in ("r", "reject"):
                try:
                    why = input(" reason > ").strip()
                except EOFError:
                    return 0
                reject(page, why or "no reason given")
                break
            if answer in ("s", "skip"):
                break
            if answer in ("q", "quit"):
                return 0
        print()
    return 0
def cmd_sync() -> int:
    """Reconcile staging index after manual operations (Obsidian moves, deletions).

    Also detects pages that were manually moved out of staging without going
    through the promotion flow and reports them.
    """
    print("Regenerating staging index...")
    regenerate_staging_index()
    # A live page still carrying `status: pending` was promoted by hand
    # without its staging frontmatter being cleaned up.
    leaked = [
        page.path
        for page in iter_live_pages()
        if str(page.frontmatter.get("status", "")) == "pending"
    ]
    if leaked:
        print("\n[warn] live pages still marked status: pending — fix manually:")
        for leaked_path in leaked:
            print(f" {leaked_path.relative_to(WIKI_DIR)}")
    print(f"\n{len(list_pending())} pending item(s) in staging.")
    return 0
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: parse arguments and dispatch to the chosen command."""
    parser = argparse.ArgumentParser(description="Wiki staging pipeline")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--list", action="store_true", help="List pending items")
    group.add_argument("--stats", action="store_true", help="Summary stats")
    group.add_argument("--promote", metavar="PATH", help="Approve a pending page")
    group.add_argument("--reject", metavar="PATH", help="Reject a pending page")
    group.add_argument("--promote-all", action="store_true", help="Promote every pending page")
    group.add_argument("--review", action="store_true", help="Interactive approval loop")
    group.add_argument("--sync", action="store_true", help="Regenerate staging index & detect drift")
    parser.add_argument("--json", action="store_true", help="JSON output for --list")
    parser.add_argument("--reason", default="", help="Rejection reason for --reject")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
    args = parser.parse_args()

    # Every command assumes staging/ exists.
    STAGING_DIR.mkdir(parents=True, exist_ok=True)

    if args.list:
        return cmd_list(as_json=args.json)
    if args.stats:
        return cmd_stats()
    if args.promote:
        return cmd_promote(args.promote, args.dry_run)
    if args.reject:
        if not args.reason:
            print("--reject requires --reason", file=sys.stderr)
            return 2
        return cmd_reject(args.reject, args.reason, args.dry_run)
    if args.promote_all:
        return cmd_promote_all(args.dry_run)
    if args.review:
        return cmd_review()
    if args.sync:
        return cmd_sync()
    return 0


if __name__ == "__main__":
    sys.exit(main())