Add wiki-distill.py as Phase 1a of the maintenance pipeline. This is
the 8th extension memex adds to Karpathy's pattern and the one that
makes the MemPalace integration a real ingest pipeline instead of
just a searchable archive beside the wiki.
## The gap distill closes
The mining layer was extracting Claude Code sessions, classifying
bullets into halls (fact/discovery/preference/advice/event/tooling),
and tagging topics. The URL harvester scanned conversations for cited
links. Hygiene refreshed last_verified on wiki pages referenced in
related: fields. But none of those steps compiled the knowledge
*inside* the conversations themselves into wiki pages. Decisions,
root causes, and patterns stayed in the summaries forever — findable
via qmd but never synthesized into canonical pages.
## What distill does
Narrow today-filter with historical rollup:
1. Find all summarized conversations dated TODAY
2. Extract their topics: — this is the "topics of today" set
3. For each topic in that set, pull ALL summarized conversations
across history that share that topic (full historical context)
4. Extract hall_facts + hall_discoveries + hall_advice bullets
(the high-signal hall types — skips event/preference/tooling)
5. Send topic group + wiki index.md to claude -p
6. Model emits JSON actions[]: new_page / update_page / skip
7. Write each action to staging/<type>/ with distill provenance
frontmatter (staged_by: wiki-distill, distill_topic,
distill_source_conversations, compilation_notes)
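A staged page's injected frontmatter ends up shaped roughly like this (all field values below are illustrative, not from a real run; the field names match the staging template in wiki-distill.py):

```yaml
---
origin: automated
status: pending
staged_date: 2026-02-11
staged_by: wiki-distill
target_path: patterns/example-topic-page.md
distill_topic: example-topic
distill_source_conversations: conversations/mc/2026-02-10-example.md
compilation_notes: Rolled up 2 conversations into one pattern page.
---
```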
First-run bootstrap: uses 7-day lookback instead of today-only so
the state file gets seeded reasonably. After that, daily runs stay
narrow.
Self-triggering: dormant topics that resurface in a new conversation
automatically pull in all historical conversations on that topic via
the rollup. Old knowledge gets distilled when it becomes relevant
again without manual intervention.
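The narrow-today/wide-history selection (steps 1-3 above) can be sketched with toy data; the dates and topic names here are made up for illustration:

```python
from datetime import date, timedelta

# Conversations as (date, topics) pairs — illustrative data only.
conversations = [
    (date(2026, 1, 5), {"zoho-api"}),
    (date(2026, 2, 1), {"zoho-api", "cron"}),
    (date(2026, 2, 11), {"cron"}),  # dated "today"
]

run_date = date(2026, 2, 11)
lookback_days = 0  # a first-run bootstrap would use 7 instead

# Steps 1+2: topics mentioned today (or within the lookback window)
cutoff = run_date - timedelta(days=lookback_days)
todays_topics = {t for d, topics in conversations if d >= cutoff for t in topics}

# Step 3: for each of today's topics, roll up ALL conversations across history
rollup = {
    topic: [d for d, topics in conversations if topic in topics]
    for topic in todays_topics
}
print(todays_topics)  # {'cron'}
print(rollup)         # {'cron': [date(2026, 2, 1), date(2026, 2, 11)]}
```

Note how the dormant `zoho-api` topic is ignored today, but would be fully rolled up (both historical conversations) the next day it appears in a new conversation.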
## Orchestration — distill BEFORE harvest
wiki-maintain.sh now has Phase 1a (distill) + Phase 1b (harvest):
1a. wiki-distill.py — conversations → staging (PRIORITY)
1b. wiki-harvest.py — URLs → raw/harvested → staging (supplement)
2. wiki-hygiene.py — decay, archive, repair, checks
3. qmd reindex
Conversation content drives the page shape; URL harvesting fills
gaps for external references conversations don't cover. New flags:
--distill-only, --no-distill, --distill-first-run.
## Verified on real wiki
Tested end-to-end on the production wiki with 611 summarized
conversations across 14 wings. First-run dry-run found 116 topic
groups worth distilling (+ 3 too-thin). Tested single-topic compile
with --topic zoho-api: the LLM rolled up 2 conversations (34
bullets), synthesized a proper pattern page with "What / Why /
Known Limitations" structure, linked it to existing wiki pages,
and landed it in staging with full distill provenance. LLM
correctly rejected claude-code-statusline (already well-covered
by an existing live page) — so the "skip" path works.
## Code additions
- scripts/wiki-distill.py (new, ~530 lines)
- scripts/wiki_lib.py: HIGH_SIGNAL_HALLS + parse_conversation_halls
+ high_signal_halls + _flatten_bullet helpers
- scripts/wiki-maintain.sh: Phase 1a distill, new flags
- tests/test_wiki_distill.py (21 new tests — hall parsing, rollup,
state management, CLI smoke tests)
- tests/test_shell_scripts.py: updated phase-name assertion for
the Phase 1a/1b split
## Docs additions
- README.md: 8th row in extensions table, updated compounding-loop
diagram, new wiki-distill.py reference in architecture overview
- docs/DESIGN-RATIONALE.md: new section 8 "Closing the MemPalace
loop" with full mempalace taxonomy mapping
- docs/ARCHITECTURE.md: wiki-distill.py section, updated phase
order, updated state file table, updated dep graph
- docs/SETUP.md: updated cron comment, first-run distill guidance,
verify section test count
- .gitignore: note distill-state.json is committed (sync across
machines), not gitignored
- docs/artifacts/signal-and-noise.html: new "Distill ⬣" top-level
tab with flow diagram, hall filter table, narrow-today/wide-
history explanation, staging provenance example
## Tests
192 tests total (+21 new, +1 regression fix), all green in ~1.5s.
701 lines · 24 KiB · Python
#!/usr/bin/env python3
"""Distill wiki pages from summarized conversation content.

This is the "closing the MemPalace loop" step: closet summaries become
the source material for new or updated wiki pages. It's parallel to
wiki-harvest.py (which compiles URL content into wiki pages) but operates
on the *content of the conversations themselves* rather than the URLs
they cite.

Scope filter (deliberately narrow):

1. Find all summarized conversations dated TODAY
2. Extract their `topics:` — this is the "topics-of-today" set
3. For each topic in that set, pull ALL summarized conversations across
   history that share that topic (rollup for full context)
4. For each topic group, extract `hall_facts` + `hall_discoveries` +
   `hall_advice` bullet content from the body
5. Send the topic group + relevant hall entries to `claude -p` with
   the current index.md, ask for new_page / update_page / both / skip
6. Write result(s) to staging/<type>/ with `staged_by: wiki-distill`

First run bootstrap (--first-run or empty state):

- Instead of "topics-of-today", use "topics-from-the-last-7-days"
- This seeds the state file so subsequent runs can stay narrow

Self-triggering property:

- Old dormant topics that resurface in a new conversation will
  automatically pull in all historical conversations on that topic
  via the rollup — no need to manually trigger reprocessing

State: `.distill-state.json` tracks processed conversations (path +
content hash + topics seen at distill time). A conversation is
re-processed if its content hash changes OR it has a new topic not
seen during the previous distill.

Usage:
    python3 scripts/wiki-distill.py                # Today-only rollup
    python3 scripts/wiki-distill.py --first-run    # Last 7 days rollup
    python3 scripts/wiki-distill.py --topic TOPIC  # Process one topic explicitly
    python3 scripts/wiki-distill.py --project mc   # Only this wing's today topics
    python3 scripts/wiki-distill.py --dry-run      # Plan only, no LLM, no writes
    python3 scripts/wiki-distill.py --no-compile   # Parse/rollup only, skip claude -p
    python3 scripts/wiki-distill.py --limit N      # Cap at N topic groups processed
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
import time
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
from typing import Any

sys.path.insert(0, str(Path(__file__).parent))
from wiki_lib import (  # noqa: E402
    CONVERSATIONS_DIR,
    INDEX_FILE,
    STAGING_DIR,
    WIKI_DIR,
    WikiPage,
    high_signal_halls,
    parse_date,
    parse_page,
    today,
)

sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

DISTILL_STATE_FILE = WIKI_DIR / ".distill-state.json"

CLAUDE_HAIKU_MODEL = "haiku"
CLAUDE_SONNET_MODEL = "sonnet"
# Content size (characters) above which we route to sonnet
SONNET_CONTENT_THRESHOLD = 15_000
CLAUDE_TIMEOUT = 600

FIRST_RUN_LOOKBACK_DAYS = 7

# Minimum number of total hall bullets across the topic group to bother
# asking the LLM. A topic with only one fact/discovery across history is
# usually not enough signal to warrant a wiki page.
MIN_BULLETS_PER_TOPIC = 2


# ---------------------------------------------------------------------------
# State management
# ---------------------------------------------------------------------------


def load_state() -> dict[str, Any]:
    defaults: dict[str, Any] = {
        "processed_convs": {},
        "processed_topics": {},
        "rejected_topics": {},
        "last_run": None,
        "first_run_complete": False,
    }
    if DISTILL_STATE_FILE.exists():
        try:
            with open(DISTILL_STATE_FILE) as f:
                state = json.load(f)
            for k, v in defaults.items():
                state.setdefault(k, v)
            return state
        except (OSError, json.JSONDecodeError):
            pass
    return defaults


def save_state(state: dict[str, Any]) -> None:
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    tmp = DISTILL_STATE_FILE.with_suffix(".json.tmp")
    with open(tmp, "w") as f:
        json.dump(state, f, indent=2, sort_keys=True)
    tmp.replace(DISTILL_STATE_FILE)


def conv_content_hash(conv: WikiPage) -> str:
    return "sha256:" + hashlib.sha256(conv.body.encode("utf-8")).hexdigest()


def conv_needs_distill(state: dict[str, Any], conv: WikiPage) -> bool:
    """Return True if this conversation should be re-processed."""
    rel = str(conv.path.relative_to(WIKI_DIR))
    entry = state.get("processed_convs", {}).get(rel)
    if not entry:
        return True
    if entry.get("content_hash") != conv_content_hash(conv):
        return True
    # New topics that weren't seen at distill time → re-process
    seen_topics = set(entry.get("topics_at_distill", []))
    current_topics = set(conv.frontmatter.get("topics") or [])
    if current_topics - seen_topics:
        return True
    return False


def mark_conv_distilled(
    state: dict[str, Any],
    conv: WikiPage,
    output_pages: list[str],
) -> None:
    rel = str(conv.path.relative_to(WIKI_DIR))
    state.setdefault("processed_convs", {})[rel] = {
        "distilled_date": today().isoformat(),
        "content_hash": conv_content_hash(conv),
        "topics_at_distill": list(conv.frontmatter.get("topics") or []),
        "output_pages": output_pages,
    }


# ---------------------------------------------------------------------------
# Conversation discovery & topic rollup
# ---------------------------------------------------------------------------


def iter_summarized_conversations(project_filter: str | None = None) -> list[WikiPage]:
    """Walk conversations/ and return all summarized conversation pages."""
    if not CONVERSATIONS_DIR.exists():
        return []
    results: list[WikiPage] = []
    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        if project_filter and project_dir.name != project_filter:
            continue
        for md in sorted(project_dir.glob("*.md")):
            page = parse_page(md)
            if not page:
                continue
            if page.frontmatter.get("status") != "summarized":
                continue
            results.append(page)
    return results


def extract_topics_from_today(
    conversations: list[WikiPage],
    target_date: date,
    lookback_days: int = 0,
) -> set[str]:
    """Find the set of topics appearing in conversations dated ≥ (target - lookback).

    lookback_days=0 → only today
    lookback_days=7 → today and the previous 7 days
    """
    cutoff = target_date - timedelta(days=lookback_days)
    topics: set[str] = set()
    for conv in conversations:
        d = parse_date(conv.frontmatter.get("date"))
        if d and d >= cutoff:
            for t in conv.frontmatter.get("topics") or []:
                t_clean = str(t).strip()
                if t_clean:
                    topics.add(t_clean)
    return topics


def rollup_conversations_by_topic(
    topic: str, conversations: list[WikiPage]
) -> list[WikiPage]:
    """Return all conversations (across all time) whose topics: list contains `topic`."""
    results: list[WikiPage] = []
    for conv in conversations:
        conv_topics = conv.frontmatter.get("topics") or []
        if topic in conv_topics:
            results.append(conv)
    # Most recent first so the LLM sees the current state before the backstory
    results.sort(
        key=lambda c: parse_date(c.frontmatter.get("date")) or date.min,
        reverse=True,
    )
    return results


# ---------------------------------------------------------------------------
# Build the LLM input for a topic group
# ---------------------------------------------------------------------------


@dataclass
class TopicGroup:
    topic: str
    conversations: list[WikiPage]
    halls_by_conv: list[dict[str, list[str]]]
    total_bullets: int


def build_topic_group(topic: str, conversations: list[WikiPage]) -> TopicGroup:
    halls_by_conv: list[dict[str, list[str]]] = []
    total = 0
    for conv in conversations:
        halls = high_signal_halls(conv)
        halls_by_conv.append(halls)
        total += sum(len(v) for v in halls.values())
    return TopicGroup(
        topic=topic,
        conversations=conversations,
        halls_by_conv=halls_by_conv,
        total_bullets=total,
    )


def format_topic_group_for_llm(group: TopicGroup) -> str:
    """Render a topic group as a prompt-friendly markdown block."""
    lines = [f"# Topic: {group.topic}", ""]
    lines.append(
        f"Found {len(group.conversations)} summarized conversation(s) tagged "
        f"with this topic, containing {group.total_bullets} high-signal bullets "
        f"across fact/discovery/advice halls."
    )
    lines.append("")
    for conv, halls in zip(group.conversations, group.halls_by_conv):
        rel = str(conv.path.relative_to(WIKI_DIR))
        date_str = conv.frontmatter.get("date", "unknown")
        title = conv.frontmatter.get("title", conv.path.stem)
        project = conv.frontmatter.get("project", "?")
        lines.append(f"## {date_str} — {title} ({project})")
        lines.append(f"_Source: `{rel}`_")
        lines.append("")
        for hall_type in ("fact", "discovery", "advice"):
            bullets = halls.get(hall_type) or []
            if not bullets:
                continue
            label = {"fact": "Decisions", "discovery": "Discoveries", "advice": "Advice"}[hall_type]
            lines.append(f"**{label}:**")
            for b in bullets:
                lines.append(f"- {b}")
            lines.append("")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Claude compilation
# ---------------------------------------------------------------------------


DISTILL_PROMPT_TEMPLATE = """You are distilling wiki pages from summarized conversation content.

The wiki schema and conventions are defined in CLAUDE.md. The wiki has four
content directories: patterns/ (HOW), decisions/ (WHY), environments/ (WHERE),
concepts/ (WHAT). All pages require YAML frontmatter with title, type,
confidence, origin, sources, related, last_compiled, last_verified.

IMPORTANT: Do NOT include `status`, `staged_*`, `target_path`, `modifies`,
or `compilation_notes` fields in your page frontmatter — the distill script
injects those automatically.

Your task: given a topic group (all conversations across history that share
a topic, with their decisions/discoveries/advice), decide what wiki pages
should be created or updated. Emit a single JSON object with an `actions`
array. Each action is one of:

- "new_page" — create a new wiki page from the distilled knowledge
- "update_page" — update an existing live wiki page (add content, merge)
- "skip" — content isn't substantive enough for a wiki page
  OR the topic is already well-covered elsewhere

Schema:

{{
  "rationale": "1-2 sentences explaining your decision",
  "actions": [
    {{
      "type": "new_page",
      "directory": "patterns" | "decisions" | "environments" | "concepts",
      "filename": "kebab-case-name.md",
      "content": "full markdown including frontmatter"
    }},
    {{
      "type": "update_page",
      "path": "patterns/existing-page.md",
      "content": "full updated markdown including frontmatter (merged)"
    }},
    {{
      "type": "skip",
      "reason": "why this topic doesn't need a wiki page"
    }}
  ]
}}

You can emit MULTIPLE actions — e.g. a new_page for a concept and an
update_page to an existing pattern that now has new context.

Emit ONLY the JSON object. No prose, no markdown fences.

--- WIKI INDEX (existing pages) ---

{wiki_index}

--- TOPIC GROUP ---

{topic_group}
"""


def call_claude_distill(prompt: str, model: str) -> dict[str, Any] | None:
    try:
        result = subprocess.run(
            ["claude", "-p", "--model", model, "--output-format", "text", prompt],
            capture_output=True,
            text=True,
            timeout=CLAUDE_TIMEOUT,
        )
    except FileNotFoundError:
        print("  [warn] claude CLI not found — skipping compilation", file=sys.stderr)
        return None
    except subprocess.TimeoutExpired:
        print("  [warn] claude -p timed out", file=sys.stderr)
        return None
    if result.returncode != 0:
        print(f"  [warn] claude -p failed: {result.stderr.strip()[:200]}", file=sys.stderr)
        return None

    output = result.stdout.strip()
    match = re.search(r"\{.*\}", output, re.DOTALL)
    if not match:
        print(f"  [warn] no JSON found in claude output ({len(output)} chars)", file=sys.stderr)
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print(f"  [warn] JSON parse failed: {e}", file=sys.stderr)
        return None


# ---------------------------------------------------------------------------
# Staging output
# ---------------------------------------------------------------------------


STAGING_INJECT_TEMPLATE = (
    "---\n"
    "origin: automated\n"
    "status: pending\n"
    "staged_date: {staged_date}\n"
    "staged_by: wiki-distill\n"
    "target_path: {target_path}\n"
    "{modifies_line}"
    "distill_topic: {topic}\n"
    "distill_source_conversations: {source_convs}\n"
    "compilation_notes: {compilation_notes}\n"
)


def _inject_staging_frontmatter(
    content: str,
    target_path: str,
    topic: str,
    source_convs: list[str],
    compilation_notes: str,
    modifies: str | None,
) -> str:
    # Strip any script-managed fields the model emitted despite instructions
    content = re.sub(
        r"^(status|origin|staged_\w+|target_path|modifies|distill_\w+|compilation_notes):.*\n",
        "",
        content,
        flags=re.MULTILINE,
    )

    modifies_line = f"modifies: {modifies}\n" if modifies else ""
    clean_notes = compilation_notes.replace("\n", " ").replace("\r", " ").strip()
    sources_yaml = ",".join(source_convs)
    injection = STAGING_INJECT_TEMPLATE.format(
        staged_date=datetime.now(timezone.utc).date().isoformat(),
        target_path=target_path,
        modifies_line=modifies_line,
        topic=topic,
        source_convs=sources_yaml,
        compilation_notes=clean_notes or "(distilled from conversation topic group)",
    )

    if content.startswith("---\n"):
        return injection + content[4:]
    return injection + "---\n" + content


def _unique_staging_path(base: Path) -> Path:
    if not base.exists():
        return base
    suffix = hashlib.sha256(str(base).encode() + str(time.time()).encode()).hexdigest()[:6]
    return base.with_stem(f"{base.stem}-{suffix}")


def apply_distill_actions(
    result: dict[str, Any],
    topic: str,
    source_convs: list[str],
    dry_run: bool,
) -> list[Path]:
    written: list[Path] = []
    actions = result.get("actions") or []
    rationale = result.get("rationale", "")

    for action in actions:
        action_type = action.get("type")
        if action_type == "skip":
            reason = action.get("reason", "not substantive enough")
            print(f"  [skip] topic={topic!r}: {reason}")
            continue

        if action_type == "new_page":
            directory = action.get("directory") or "patterns"
            filename = action.get("filename")
            content = action.get("content")
            if not filename or not content:
                print(f"  [warn] incomplete new_page action for topic={topic!r}", file=sys.stderr)
                continue
            target_rel = f"{directory}/{filename}"
            dest = _unique_staging_path(STAGING_DIR / target_rel)
            if dry_run:
                print(f"  [dry-run] new_page → {dest.relative_to(WIKI_DIR)}")
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            injected = _inject_staging_frontmatter(
                content,
                target_path=target_rel,
                topic=topic,
                source_convs=source_convs,
                compilation_notes=rationale,
                modifies=None,
            )
            dest.write_text(injected)
            written.append(dest)
            print(f"  [new] {dest.relative_to(WIKI_DIR)}")
            continue

        if action_type == "update_page":
            target_rel = action.get("path")
            content = action.get("content")
            if not target_rel or not content:
                print(f"  [warn] incomplete update_page action for topic={topic!r}", file=sys.stderr)
                continue
            dest = _unique_staging_path(STAGING_DIR / target_rel)
            if dry_run:
                print(f"  [dry-run] update_page → {dest.relative_to(WIKI_DIR)} (modifies {target_rel})")
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            injected = _inject_staging_frontmatter(
                content,
                target_path=target_rel,
                topic=topic,
                source_convs=source_convs,
                compilation_notes=rationale,
                modifies=target_rel,
            )
            dest.write_text(injected)
            written.append(dest)
            print(f"  [upd] {dest.relative_to(WIKI_DIR)} (modifies {target_rel})")
            continue

        print(f"  [warn] unknown action type: {action_type!r}", file=sys.stderr)

    return written


# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------


def pick_model(topic_group: TopicGroup, prompt: str) -> str:
    if len(prompt) > SONNET_CONTENT_THRESHOLD or topic_group.total_bullets > 20:
        return CLAUDE_SONNET_MODEL
    return CLAUDE_HAIKU_MODEL


def process_topic(
    topic: str,
    conversations: list[WikiPage],
    state: dict[str, Any],
    dry_run: bool,
    compile_enabled: bool,
) -> tuple[str, list[Path]]:
    """Process a single topic group. Returns (status, written_paths)."""

    group = build_topic_group(topic, conversations)

    if group.total_bullets < MIN_BULLETS_PER_TOPIC:
        return f"too-thin (only {group.total_bullets} bullets)", []

    if topic in state.get("rejected_topics", {}):
        return "previously-rejected", []

    wiki_index_text = ""
    try:
        wiki_index_text = INDEX_FILE.read_text()[:15_000]
    except OSError:
        pass

    topic_group_text = format_topic_group_for_llm(group)
    prompt = DISTILL_PROMPT_TEMPLATE.format(
        wiki_index=wiki_index_text,
        topic_group=topic_group_text,
    )

    if dry_run:
        model = pick_model(group, prompt)
        return (
            f"would-distill ({len(group.conversations)} convs, "
            f"{group.total_bullets} bullets, {model})"
        ), []

    if not compile_enabled:
        return (
            f"skipped-compile ({len(group.conversations)} convs, "
            f"{group.total_bullets} bullets)"
        ), []

    model = pick_model(group, prompt)
    print(f"  [compile] topic={topic!r} "
          f"convs={len(group.conversations)} bullets={group.total_bullets} model={model}")

    result = call_claude_distill(prompt, model)
    if result is None:
        return "compile-failed", []

    actions = result.get("actions") or []
    if not actions or all(a.get("type") == "skip" for a in actions):
        reason = result.get("rationale", "AI chose to skip")
        state.setdefault("rejected_topics", {})[topic] = {
            "reason": reason,
            "rejected_date": today().isoformat(),
        }
        return "rejected-by-llm", []

    source_convs = [str(c.path.relative_to(WIKI_DIR)) for c in group.conversations]
    written = apply_distill_actions(result, topic, source_convs, dry_run=False)

    for conv in group.conversations:
        mark_conv_distilled(state, conv, [str(p.relative_to(WIKI_DIR)) for p in written])

    state.setdefault("processed_topics", {})[topic] = {
        "distilled_date": today().isoformat(),
        "conversations": source_convs,
        "output_pages": [str(p.relative_to(WIKI_DIR)) for p in written],
    }

    return f"distilled ({len(written)} page(s))", written


def run(
    *,
    first_run: bool,
    explicit_topic: str | None,
    project_filter: str | None,
    dry_run: bool,
    compile_enabled: bool,
    limit: int,
) -> int:
    state = load_state()
    if not state.get("first_run_complete"):
        first_run = True

    all_convs = iter_summarized_conversations(project_filter)
    print(f"Scanning {len(all_convs)} summarized conversation(s)...")

    # Figure out which topics to process
    if explicit_topic:
        topics_to_process: set[str] = {explicit_topic}
        print(f"Explicit topic mode: {explicit_topic!r}")
    else:
        lookback = FIRST_RUN_LOOKBACK_DAYS if first_run else 0
        topics_to_process = extract_topics_from_today(all_convs, today(), lookback)
        if first_run:
            print(f"First-run bootstrap: last {FIRST_RUN_LOOKBACK_DAYS} days → "
                  f"{len(topics_to_process)} topic(s)")
        else:
            print(f"Today-only mode: {len(topics_to_process)} topic(s) from today's conversations")

    if not topics_to_process:
        print("No topics to distill.")
        if first_run:
            state["first_run_complete"] = True
            save_state(state)
        return 0

    # Sort for deterministic ordering
    topics_ordered = sorted(topics_to_process)

    stats: dict[str, int] = {}
    processed = 0
    total_written: list[Path] = []

    for topic in topics_ordered:
        convs = rollup_conversations_by_topic(topic, all_convs)
        if not convs:
            stats["no-matches"] = stats.get("no-matches", 0) + 1
            continue

        print(f"\n[{topic}] rollup: {len(convs)} conversation(s)")
        status, written = process_topic(
            topic, convs, state, dry_run=dry_run, compile_enabled=compile_enabled
        )
        stats[status.split(" ")[0]] = stats.get(status.split(" ")[0], 0) + 1
        print(f"  [{status}]")

        total_written.extend(written)
        if not dry_run:
            processed += 1
            save_state(state)

        if limit and processed >= limit:
            print(f"\nLimit reached ({limit}); stopping.")
            break

    if first_run and not dry_run:
        state["first_run_complete"] = True
    if not dry_run:
        save_state(state)

    print("\nSummary:")
    for status, count in sorted(stats.items()):
        print(f"  {status}: {count}")
    print(f"\n{len(total_written)} staging page(s) written")
    return 0


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__.split("\n\n")[0])
    parser.add_argument("--first-run", action="store_true",
                        help="Bootstrap with last 7 days instead of today-only")
    parser.add_argument("--topic", default=None,
                        help="Process one specific topic explicitly")
    parser.add_argument("--project", default=None,
                        help="Only consider conversations under this wing")
    parser.add_argument("--dry-run", action="store_true",
                        help="Plan only; no LLM calls, no writes")
    parser.add_argument("--no-compile", action="store_true",
                        help="Parse + rollup only; skip claude -p step")
    parser.add_argument("--limit", type=int, default=0,
                        help="Stop after N topic groups processed (0 = unlimited)")
    args = parser.parse_args()

    return run(
        first_run=args.first_run,
        explicit_topic=args.topic,
        project_filter=args.project,
        dry_run=args.dry_run,
        compile_enabled=not args.no_compile,
        limit=args.limit,
    )


if __name__ == "__main__":
    sys.exit(main())