#!/usr/bin/env python3
"""Distill wiki pages from summarized conversation content.

This is the "closing the MemPalace loop" step: closet summaries become
the source material for new or updated wiki pages. It's parallel to
wiki-harvest.py (which compiles URL content into wiki pages) but operates
on the *content of the conversations themselves* rather than the URLs
they cite.

Scope filter (deliberately narrow):

  1. Find all summarized conversations dated TODAY
  2. Extract their `topics:` — this is the "topics-of-today" set
  3. For each topic in that set, pull ALL summarized conversations across
     history that share that topic (rollup for full context)
  4. For each topic group, extract `hall_facts` + `hall_discoveries` +
     `hall_advice` bullet content from the body
  5. Send the topic group + relevant hall entries to `claude -p` with
     the current index.md, ask for new_page / update_page / both / skip
  6. Write result(s) to staging/<directory>/ with `staged_by: wiki-distill`

First run bootstrap (--first-run or empty state):

  - Instead of "topics-of-today", use "topics-from-the-last-7-days"
  - This seeds the state file so subsequent runs can stay narrow

Self-triggering property:

  - Old dormant topics that resurface in a new conversation will
    automatically pull in all historical conversations on that topic
    via the rollup — no need to manually trigger reprocessing

State: `.distill-state.json` tracks processed conversations (path +
content hash + topics seen at distill time). A conversation is
re-processed if its content hash changes OR it has a new topic not seen
during the previous distill.

Usage:
    python3 scripts/wiki-distill.py                  # Today-only rollup
    python3 scripts/wiki-distill.py --first-run      # Last 7 days rollup
    python3 scripts/wiki-distill.py --topic TOPIC    # Process one topic explicitly
    python3 scripts/wiki-distill.py --project mc     # Only this wing's today topics
    python3 scripts/wiki-distill.py --dry-run        # Plan only, no LLM, no writes
    python3 scripts/wiki-distill.py --no-compile     # Parse/rollup only, skip claude -p
    python3 scripts/wiki-distill.py --limit N        # Cap at N topic groups processed
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
import time
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
from typing import Any

# Make the sibling wiki_lib module importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent))
from wiki_lib import (  # noqa: E402
    CONVERSATIONS_DIR,
    INDEX_FILE,
    STAGING_DIR,
    WIKI_DIR,
    WikiPage,
    high_signal_halls,
    parse_date,
    parse_page,
    today,
)

# Line-buffer both streams so progress lines appear promptly.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Persistent distill state, stored inside the wiki directory.
DISTILL_STATE_FILE = WIKI_DIR / ".distill-state.json"

# Model names passed to `claude -p --model ...`.
CLAUDE_HAIKU_MODEL = "haiku"
CLAUDE_SONNET_MODEL = "sonnet"

# Content size (characters) above which we route to sonnet
SONNET_CONTENT_THRESHOLD = 15_000

# Seconds; passed as `timeout=` to subprocess.run() for the claude call.
CLAUDE_TIMEOUT = 600

# Lookback window (days) used for the first-run bootstrap.
FIRST_RUN_LOOKBACK_DAYS = 7

# Minimum number of total hall bullets across the topic group to bother
# asking the LLM. A topic with only one fact/discovery across history is
# usually not enough signal to warrant a wiki page.
MIN_BULLETS_PER_TOPIC = 2


# ---------------------------------------------------------------------------
# State management
# ---------------------------------------------------------------------------


def load_state() -> dict[str, Any]:
    """Load the distill state file, filling in any missing top-level keys.

    Returns:
        The persisted state, or a fresh default state when the file is
        absent, unreadable, not valid JSON, or not a JSON object. A warning
        is printed in the failure cases so a reset is never silent.
    """
    defaults: dict[str, Any] = {
        "processed_convs": {},
        "processed_topics": {},
        "rejected_topics": {},
        "last_run": None,
        "first_run_complete": False,
    }
    if not DISTILL_STATE_FILE.exists():
        return defaults
    try:
        with open(DISTILL_STATE_FILE, encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"  [warn] could not read {DISTILL_STATE_FILE.name} ({e}); "
            "starting from empty state",
            file=sys.stderr,
        )
        return defaults
    if not isinstance(state, dict):
        print(
            f"  [warn] {DISTILL_STATE_FILE.name} is not a JSON object; "
            "starting from empty state",
            file=sys.stderr,
        )
        return defaults
    for key, value in defaults.items():
        state.setdefault(key, value)
    return state


def save_state(state: dict[str, Any]) -> None:
    """Stamp `last_run` and write the state atomically (temp file + rename)."""
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    tmp = DISTILL_STATE_FILE.with_suffix(".json.tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2, sort_keys=True)
    tmp.replace(DISTILL_STATE_FILE)


def conv_content_hash(conv: WikiPage) -> str:
    """Return a `sha256:`-prefixed hex digest of the conversation body."""
    return "sha256:" + hashlib.sha256(conv.body.encode("utf-8")).hexdigest()


def conv_needs_distill(state: dict[str, Any], conv: WikiPage) -> bool:
    """Return True if this conversation should be re-processed.

    A conversation needs distilling when it has no state entry, when its
    body hash differs from the recorded one, or when it carries a topic
    that was not recorded at the previous distill.
    """
    rel = str(conv.path.relative_to(WIKI_DIR))
    entry = state.get("processed_convs", {}).get(rel)
    if not entry:
        return True
    if entry.get("content_hash") != conv_content_hash(conv):
        return True
    # New topics that weren't seen at distill time → re-process
    seen_topics = set(entry.get("topics_at_distill", []))
    current_topics = set(conv.frontmatter.get("topics") or [])
    return bool(current_topics - seen_topics)


def mark_conv_distilled(
    state: dict[str, Any],
    conv: WikiPage,
    output_pages: list[str],
) -> None:
    """Record (or overwrite) the state entry for a distilled conversation."""
    rel = str(conv.path.relative_to(WIKI_DIR))
    state.setdefault("processed_convs", {})[rel] = {
        "distilled_date": today().isoformat(),
        "content_hash": conv_content_hash(conv),
        "topics_at_distill": list(conv.frontmatter.get("topics") or []),
        "output_pages": output_pages,
    }


# ---------------------------------------------------------------------------
# Conversation discovery & topic rollup
# ---------------------------------------------------------------------------


def _conv_topics(conv: WikiPage) -> list[str]:
    """Return the conversation's `topics:` as stripped, non-empty strings.

    Shared by topic extraction and rollup so both sides compare the same
    normalized values. A bare string is treated as a single topic rather
    than being iterated character by character.
    """
    raw = conv.frontmatter.get("topics") or []
    if isinstance(raw, str):
        raw = [raw]
    return [s for s in (str(t).strip() for t in raw) if s]


def iter_summarized_conversations(project_filter: str | None = None) -> list[WikiPage]:
    """Walk conversations/ and return all summarized conversation pages.

    Args:
        project_filter: When given, only the sub-directory with exactly
            this name is scanned.

    Returns:
        Parsed pages whose frontmatter `status` is "summarized", ordered
        by project directory name and then file name.
    """
    if not CONVERSATIONS_DIR.exists():
        return []
    results: list[WikiPage] = []
    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if project_filter and project_dir.name != project_filter:
            continue
        for md in sorted(project_dir.glob("*.md")):
            page = parse_page(md)
            if not page:
                continue
            if page.frontmatter.get("status") != "summarized":
                continue
            results.append(page)
    return results


def extract_topics_from_today(
    conversations: list[WikiPage],
    target_date: date,
    lookback_days: int = 0,
) -> set[str]:
    """Find the set of topics appearing in conversations dated ≥ (target - lookback).

    lookback_days=0 → only today
    lookback_days=7 → today and the previous 7 days
    """
    cutoff = target_date - timedelta(days=lookback_days)
    topics: set[str] = set()
    for conv in conversations:
        d = parse_date(conv.frontmatter.get("date"))
        if d and d >= cutoff:
            topics.update(_conv_topics(conv))
    return topics


def rollup_conversations_by_topic(
    topic: str, conversations: list[WikiPage]
) -> list[WikiPage]:
    """Return all conversations (across all time) whose topics: list contains `topic`.

    Matching uses the same normalization as `extract_topics_from_today`,
    so a topic discovered there is always found again here.
    """
    wanted = str(topic).strip()
    results = [conv for conv in conversations if wanted in _conv_topics(conv)]
    # Most recent first so the LLM sees the current state before the backstory
    results.sort(
        key=lambda c: parse_date(c.frontmatter.get("date")) or date.min,
        reverse=True,
    )
    return results


# ---------------------------------------------------------------------------
# Build the LLM input for a topic group
# ---------------------------------------------------------------------------


@dataclass
class TopicGroup:
    """All conversations sharing one topic, plus their extracted hall bullets."""

    topic: str
    conversations: list[WikiPage]
    # Parallel to `conversations`: one hall-type → bullets mapping per entry.
    halls_by_conv: list[dict[str, list[str]]]
    total_bullets: int


def build_topic_group(topic: str, conversations: list[WikiPage]) -> TopicGroup:
    """Collect hall bullets for each conversation and count them."""
    halls_by_conv: list[dict[str, list[str]]] = []
    total = 0
    for conv in conversations:
        halls = high_signal_halls(conv)
        halls_by_conv.append(halls)
        total += sum(len(v) for v in halls.values())
    return TopicGroup(
        topic=topic,
        conversations=conversations,
        halls_by_conv=halls_by_conv,
        total_bullets=total,
    )


# Hall key → heading shown to the LLM, in output order.
_HALL_LABELS = (
    ("fact", "Decisions"),
    ("discovery", "Discoveries"),
    ("advice", "Advice"),
)


def format_topic_group_for_llm(group: TopicGroup) -> str:
    """Render a topic group as a prompt-friendly markdown block."""
    lines = [f"# Topic: {group.topic}", ""]
    lines.append(
        f"Found {len(group.conversations)} summarized conversation(s) tagged "
        f"with this topic, containing {group.total_bullets} high-signal bullets "
        f"across fact/discovery/advice halls."
    )
    lines.append("")
    for conv, halls in zip(group.conversations, group.halls_by_conv):
        rel = str(conv.path.relative_to(WIKI_DIR))
        date_str = conv.frontmatter.get("date", "unknown")
        title = conv.frontmatter.get("title", conv.path.stem)
        project = conv.frontmatter.get("project", "?")
        lines.append(f"## {date_str} — {title} ({project})")
        lines.append(f"_Source: `{rel}`_")
        lines.append("")
        for hall_type, label in _HALL_LABELS:
            bullets = halls.get(hall_type) or []
            if not bullets:
                continue
            lines.append(f"**{label}:**")
            lines.extend(f"- {b}" for b in bullets)
            lines.append("")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Claude compilation
# ---------------------------------------------------------------------------

DISTILL_PROMPT_TEMPLATE = """You are distilling wiki pages from summarized conversation content.

The wiki schema and conventions are defined in CLAUDE.md. The wiki has four
content directories: patterns/ (HOW), decisions/ (WHY), environments/ (WHERE),
concepts/ (WHAT). All pages require YAML frontmatter with title, type,
confidence, origin, sources, related, last_compiled, last_verified.

IMPORTANT: Do NOT include `status`, `staged_*`, `target_path`, `modifies`,
or `compilation_notes` fields in your page frontmatter — the distill script
injects those automatically.

Your task: given a topic group (all conversations across history that share
a topic, with their decisions/discoveries/advice), decide what wiki pages
should be created or updated. Emit a single JSON object with an `actions`
array. Each action is one of:

  - "new_page" — create a new wiki page from the distilled knowledge
  - "update_page" — update an existing live wiki page (add content, merge)
  - "skip" — content isn't substantive enough for a wiki page
             OR the topic is already well-covered elsewhere

Schema:

{{
  "rationale": "1-2 sentences explaining your decision",
  "actions": [
    {{
      "type": "new_page",
      "directory": "patterns" | "decisions" | "environments" | "concepts",
      "filename": "kebab-case-name.md",
      "content": "full markdown including frontmatter"
    }},
    {{
      "type": "update_page",
      "path": "patterns/existing-page.md",
      "content": "full updated markdown including frontmatter (merged)"
    }},
    {{
      "type": "skip",
      "reason": "why this topic doesn't need a wiki page"
    }}
  ]
}}

You can emit MULTIPLE actions — e.g. a new_page for a concept and an
update_page to an existing pattern that now has new context.

Emit ONLY the JSON object. No prose, no markdown fences.

--- WIKI INDEX (existing pages) ---

{wiki_index}

--- TOPIC GROUP ---

{topic_group}
"""


def _extract_json_object(output: str) -> dict[str, Any] | None:
    """Pull the JSON object out of raw model output.

    Returns:
        The decoded object, or None when the text has no `{...}` span.

    Raises:
        json.JSONDecodeError: a `{...}` span exists but no JSON object can
            be decoded from it.
    """
    match = re.search(r"\{.*\}", output, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        # The greedy span can over-capture when prose containing "}" follows
        # the object; retry from the first "{" and let the decoder decide
        # where the object ends.
        parsed, _ = json.JSONDecoder().raw_decode(output, match.start())
        return parsed


def call_claude_distill(prompt: str, model: str) -> dict[str, Any] | None:
    """Run `claude -p` on the prompt and return the decoded JSON object.

    Returns:
        The parsed response, or None (after printing a warning) when the
        CLI is missing, cannot be launched, times out, exits non-zero, or
        produces no parseable JSON object.
    """
    try:
        result = subprocess.run(
            ["claude", "-p", "--model", model, "--output-format", "text", prompt],
            capture_output=True,
            text=True,
            timeout=CLAUDE_TIMEOUT,
        )
    except FileNotFoundError:
        print("  [warn] claude CLI not found — skipping compilation", file=sys.stderr)
        return None
    except subprocess.TimeoutExpired:
        print("  [warn] claude -p timed out", file=sys.stderr)
        return None
    except OSError as e:
        # e.g. the prompt exceeding the OS argument-size limit.
        print(f"  [warn] could not launch claude -p: {e}", file=sys.stderr)
        return None
    if result.returncode != 0:
        print(f"  [warn] claude -p failed: {result.stderr.strip()[:200]}", file=sys.stderr)
        return None

    output = result.stdout.strip()
    try:
        parsed = _extract_json_object(output)
    except json.JSONDecodeError as e:
        print(f"  [warn] JSON parse failed: {e}", file=sys.stderr)
        return None
    if parsed is None:
        print(f"  [warn] no JSON found in claude output ({len(output)} chars)", file=sys.stderr)
    return parsed


# ---------------------------------------------------------------------------
# Staging output
# ---------------------------------------------------------------------------

# NOTE(review): values are emitted unquoted; a topic or rationale containing
# ": " or " #" may not round-trip through a strict YAML parser — confirm
# what parse_page tolerates before relying on these fields.
STAGING_INJECT_TEMPLATE = (
    "---\n"
    "origin: automated\n"
    "status: pending\n"
    "staged_date: {staged_date}\n"
    "staged_by: wiki-distill\n"
    "target_path: {target_path}\n"
    "{modifies_line}"
    "distill_topic: {topic}\n"
    "distill_source_conversations: {source_convs}\n"
    "compilation_notes: {compilation_notes}\n"
)

# Frontmatter keys owned by this script, including any indented
# continuation lines that belong to a stripped key.
_MANAGED_FRONTMATTER_RE = re.compile(
    r"^(status|origin|staged_\w+|target_path|modifies|distill_\w+|compilation_notes):.*\n"
    r"(?:[ \t]+\S.*\n)*",
    flags=re.MULTILINE,
)
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t]*$", flags=re.MULTILINE)


def _inject_staging_frontmatter(
    content: str,
    target_path: str,
    topic: str,
    source_convs: list[str],
    compilation_notes: str,
    modifies: str | None,
) -> str:
    """Prepend the staging fields to a page's frontmatter.

    Script-managed keys already present in the page's frontmatter are
    removed first so they cannot conflict with the injected values. Only
    the frontmatter is touched: body lines that happen to look like
    `status: ...` are preserved.

    Args:
        content: Page markdown, with or without a leading `---` block.
        target_path: Wiki-relative destination recorded as `target_path`.
        topic: Topic recorded as `distill_topic`.
        source_convs: Conversation paths, recorded comma-joined.
        compilation_notes: Free text; newlines are flattened to spaces.
        modifies: Existing page being updated, or None for a new page.

    Returns:
        The page text starting with the injected frontmatter fields.
    """
    modifies_line = f"modifies: {modifies}\n" if modifies else ""
    clean_notes = compilation_notes.replace("\n", " ").replace("\r", " ").strip()
    injection = STAGING_INJECT_TEMPLATE.format(
        staged_date=datetime.now(timezone.utc).date().isoformat(),
        target_path=target_path,
        modifies_line=modifies_line,
        topic=topic,
        source_convs=",".join(source_convs),
        compilation_notes=clean_notes or "(distilled from conversation topic group)",
    )

    # Leading blank lines would otherwise hide the opening "---" and yield
    # two frontmatter blocks.
    content = content.lstrip()
    if not content.startswith("---\n"):
        return injection + "---\n" + content

    remainder = content[4:]
    close = _FRONTMATTER_CLOSE_RE.search(remainder)
    if close:
        front = _MANAGED_FRONTMATTER_RE.sub("", remainder[: close.start()])
        remainder = front + remainder[close.start():]
    else:
        # Unterminated frontmatter: everything after "---" is treated as
        # frontmatter, so strip throughout.
        remainder = _MANAGED_FRONTMATTER_RE.sub("", remainder)
    return injection + remainder


def _unique_staging_path(base: Path) -> Path:
    """Return `base`, or `base` with a short hash suffix if it already exists."""
    if not base.exists():
        return base
    suffix = hashlib.sha256(str(base).encode() + str(time.time()).encode()).hexdigest()[:6]
    return base.with_stem(f"{base.stem}-{suffix}")


def _safe_relative_target(raw: Any) -> str | None:
    """Validate a model-supplied page path.

    Returns the path in POSIX form when it is a non-empty relative path
    with no `..` component, otherwise None. This keeps staged files inside
    the staging directory.
    """
    if not isinstance(raw, str) or not raw.strip():
        return None
    candidate = Path(raw.strip())
    if candidate.is_absolute() or candidate.anchor or ".." in candidate.parts:
        return None
    if not candidate.parts:
        return None
    return candidate.as_posix()


def apply_distill_actions(
    result: dict[str, Any],
    topic: str,
    source_convs: list[str],
    dry_run: bool,
) -> list[Path]:
    """Write the pages requested by a distill response into staging.

    Args:
        result: Decoded model response with `actions` and `rationale`.
        topic: Topic the response was generated for.
        source_convs: Wiki-relative paths of the source conversations.
        dry_run: When True, only print what would be written.

    Returns:
        Paths of the staging files actually written. Malformed, incomplete
        or unsafe actions are skipped with a warning.
    """
    written: list[Path] = []
    actions = result.get("actions") or []
    if not isinstance(actions, list):
        print(f"  [warn] malformed actions for topic={topic!r}", file=sys.stderr)
        return written
    rationale = str(result.get("rationale") or "")

    for action in actions:
        if not isinstance(action, dict):
            print(f"  [warn] ignoring malformed action: {action!r}", file=sys.stderr)
            continue

        action_type = action.get("type")
        if action_type == "skip":
            reason = action.get("reason", "not substantive enough")
            print(f"  [skip] topic={topic!r}: {reason}")
            continue
        if action_type not in ("new_page", "update_page"):
            print(f"  [warn] unknown action type: {action_type!r}", file=sys.stderr)
            continue

        content = action.get("content")
        if action_type == "new_page":
            directory = action.get("directory") or "patterns"
            filename = action.get("filename")
            raw_target = f"{directory}/{filename}" if filename else None
        else:
            raw_target = action.get("path")
        if not raw_target or not content or not isinstance(content, str):
            print(f"  [warn] incomplete {action_type} action for topic={topic!r}", file=sys.stderr)
            continue

        target_rel = _safe_relative_target(raw_target)
        if target_rel is None:
            print(
                f"  [warn] refusing unsafe path {raw_target!r} for topic={topic!r}",
                file=sys.stderr,
            )
            continue

        is_update = action_type == "update_page"
        modifies_note = f" (modifies {target_rel})" if is_update else ""
        dest = _unique_staging_path(STAGING_DIR / target_rel)
        if dry_run:
            print(f"  [dry-run] {action_type} → {dest.relative_to(WIKI_DIR)}{modifies_note}")
            continue

        dest.parent.mkdir(parents=True, exist_ok=True)
        injected = _inject_staging_frontmatter(
            content,
            target_path=target_rel,
            topic=topic,
            source_convs=source_convs,
            compilation_notes=rationale,
            modifies=target_rel if is_update else None,
        )
        dest.write_text(injected, encoding="utf-8")
        written.append(dest)
        tag = "[upd]" if is_update else "[new]"
        print(f"  {tag} {dest.relative_to(WIKI_DIR)}{modifies_note}")

    return written


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------


def pick_model(topic_group: TopicGroup, prompt: str) -> str:
    """Route large prompts or bullet-heavy topic groups to the sonnet model."""
    if len(prompt) > SONNET_CONTENT_THRESHOLD or topic_group.total_bullets > 20:
        return CLAUDE_SONNET_MODEL
    return CLAUDE_HAIKU_MODEL


def process_topic(
    topic: str,
    conversations: list[WikiPage],
    state: dict[str, Any],
    dry_run: bool,
    compile_enabled: bool,
) -> tuple[str, list[Path]]:
    """Process a single topic group. Returns (status, written_paths).

    The first word of `status` is used by the caller as a summary bucket.
    `state` is mutated in place (rejections, processed conversations and
    topics); the caller is responsible for saving it.
    """
    group = build_topic_group(topic, conversations)

    if group.total_bullets < MIN_BULLETS_PER_TOPIC:
        return f"too-thin (only {group.total_bullets} bullets)", []

    if topic in state.get("rejected_topics", {}):
        return "previously-rejected", []

    try:
        wiki_index_text = INDEX_FILE.read_text(encoding="utf-8", errors="replace")[:15_000]
    except OSError:
        # A missing index is fine: the model simply sees no existing pages.
        wiki_index_text = ""

    prompt = DISTILL_PROMPT_TEMPLATE.format(
        wiki_index=wiki_index_text,
        topic_group=format_topic_group_for_llm(group),
    )

    if dry_run:
        model = pick_model(group, prompt)
        return (
            f"would-distill ({len(group.conversations)} convs, "
            f"{group.total_bullets} bullets, {model})"
        ), []

    if not compile_enabled:
        return (
            f"skipped-compile ({len(group.conversations)} convs, "
            f"{group.total_bullets} bullets)"
        ), []

    model = pick_model(group, prompt)
    print(
        f"  [compile] topic={topic!r} "
        f"convs={len(group.conversations)} bullets={group.total_bullets} model={model}"
    )
    result = call_claude_distill(prompt, model)
    if result is None:
        return "compile-failed", []

    actions = result.get("actions") or []
    if not isinstance(actions, list) or not all(isinstance(a, dict) for a in actions):
        # A malformed response is a failed compile, not a verdict on the
        # topic, so it must not be persisted as a rejection.
        print(f"  [warn] malformed actions for topic={topic!r}", file=sys.stderr)
        return "compile-failed", []

    if not actions or all(a.get("type") == "skip" for a in actions):
        reason = result.get("rationale", "AI chose to skip")
        state.setdefault("rejected_topics", {})[topic] = {
            "reason": reason,
            "rejected_date": today().isoformat(),
        }
        return "rejected-by-llm", []

    source_convs = [str(c.path.relative_to(WIKI_DIR)) for c in group.conversations]
    written = apply_distill_actions(result, topic, source_convs, dry_run=False)
    output_pages = [str(p.relative_to(WIKI_DIR)) for p in written]

    for conv in group.conversations:
        mark_conv_distilled(state, conv, list(output_pages))

    state.setdefault("processed_topics", {})[topic] = {
        "distilled_date": today().isoformat(),
        "conversations": source_convs,
        "output_pages": output_pages,
    }

    return f"distilled ({len(written)} page(s))", written


def run(
    *,
    first_run: bool,
    explicit_topic: str | None,
    project_filter: str | None,
    dry_run: bool,
    compile_enabled: bool,
    limit: int,
) -> int:
    """Run the distill pipeline and return a process exit code.

    Args:
        first_run: Force the bootstrap lookback window. It is also forced
            when the state has no completed first run.
        explicit_topic: Process only this topic instead of deriving topics
            from recent conversations.
        project_filter: Restrict scanning to one conversations sub-directory.
        dry_run: Plan only; no LLM calls and no writes, including state.
        compile_enabled: When False, stop before calling the LLM.
        limit: Stop after this many topic groups (0 or less = unlimited).
    """
    state = load_state()
    if not state.get("first_run_complete"):
        first_run = True

    all_convs = iter_summarized_conversations(project_filter)
    print(f"Scanning {len(all_convs)} summarized conversation(s)...")

    # Figure out which topics to process
    if explicit_topic:
        topics_to_process: set[str] = {explicit_topic}
        print(f"Explicit topic mode: {explicit_topic!r}")
    else:
        lookback = FIRST_RUN_LOOKBACK_DAYS if first_run else 0
        topics_to_process = extract_topics_from_today(all_convs, today(), lookback)
        if first_run:
            print(
                f"First-run bootstrap: last {FIRST_RUN_LOOKBACK_DAYS} days → "
                f"{len(topics_to_process)} topic(s)"
            )
        else:
            print(f"Today-only mode: {len(topics_to_process)} topic(s) from today's conversations")

    if not topics_to_process:
        print("No topics to distill.")
        # A dry run must not touch the state file.
        if first_run and not dry_run:
            state["first_run_complete"] = True
            save_state(state)
        return 0

    stats: dict[str, int] = {}
    processed = 0
    total_written: list[Path] = []

    # NOTE(review): conv_needs_distill() is not called anywhere in this file,
    # so a topic is re-sent to the LLM on every run while it still appears in
    # the selected window — confirm whether that is intended.
    # Sorted for deterministic ordering.
    for topic in sorted(topics_to_process):
        convs = rollup_conversations_by_topic(topic, all_convs)
        if not convs:
            stats["no-matches"] = stats.get("no-matches", 0) + 1
            continue

        print(f"\n[{topic}] rollup: {len(convs)} conversation(s)")
        status, written = process_topic(
            topic, convs, state, dry_run=dry_run, compile_enabled=compile_enabled
        )
        bucket = status.split(" ")[0]
        stats[bucket] = stats.get(bucket, 0) + 1
        print(f"  [{status}]")

        total_written.extend(written)
        # Count in dry-run too, so the plan reflects what --limit would allow.
        processed += 1
        if not dry_run:
            save_state(state)

        if limit > 0 and processed >= limit:
            print(f"\nLimit reached ({limit}); stopping.")
            break

    if not dry_run:
        # Only a real bootstrap pass counts: an explicit --topic run or a
        # --no-compile run has not distilled the lookback window.
        if first_run and not explicit_topic and compile_enabled:
            state["first_run_complete"] = True
        save_state(state)

    print("\nSummary:")
    for status, count in sorted(stats.items()):
        print(f"  {status}: {count}")
    print(f"\n{len(total_written)} staging page(s) written")
    return 0


def main() -> int:
    """Parse command-line arguments and run the pipeline."""
    parser = argparse.ArgumentParser(description=__doc__.split("\n\n")[0])
    parser.add_argument("--first-run", action="store_true",
                        help="Bootstrap with last 7 days instead of today-only")
    parser.add_argument("--topic", default=None,
                        help="Process one specific topic explicitly")
    parser.add_argument("--project", default=None,
                        help="Only consider conversations under this wing")
    parser.add_argument("--dry-run", action="store_true",
                        help="Plan only; no LLM calls, no writes")
    parser.add_argument("--no-compile", action="store_true",
                        help="Parse + rollup only; skip claude -p step")
    parser.add_argument("--limit", type=int, default=0,
                        help="Stop after N topic groups processed (0 = unlimited)")
    args = parser.parse_args()

    return run(
        first_run=args.first_run,
        explicit_topic=args.topic,
        project_filter=args.project,
        dry_run=args.dry_run,
        compile_enabled=not args.no_compile,
        limit=args.limit,
    )


if __name__ == "__main__":
    sys.exit(main())