Initial commit — memex

A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
This commit is contained in:
Eric Turner
2026-04-12 21:16:02 -06:00
commit ee54a2f5d4
31 changed files with 10792 additions and 0 deletions

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""Update conversation index and context files from summarized conversations.
Phase C of the conversation mining pipeline. Reads all conversation markdown
files and regenerates:
- conversations/index.md — catalog organized by project
- context/wake-up.md — world briefing from recent conversations
- context/active-concerns.md — current blockers and open threads
Usage:
python3 update-conversation-index.py
python3 update-conversation-index.py --reindex # Also triggers qmd update
"""
from __future__ import annotations
import argparse
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root of the wiki checkout; override via the WIKI_DIR environment variable.
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
# Mined conversation markdown lives under conversations/<project-code>/.
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
# Auto-generated context briefings are written here.
CONTEXT_DIR = WIKI_DIR / "context"
# Files regenerated from scratch on every run.
INDEX_FILE = CONVERSATIONS_DIR / "index.md"
WAKEUP_FILE = CONTEXT_DIR / "wake-up.md"
CONCERNS_FILE = CONTEXT_DIR / "active-concerns.md"
# ════════════════════════════════════════════════════════════════════════════
# CONFIGURE ME — Project code to display name mapping
# ════════════════════════════════════════════════════════════════════════════
#
# Every project code you use in `extract-sessions.py`'s PROJECT_MAP should
# have a display name here. The conversation index groups conversations by
# these codes and renders them under sections named by the display name.
#
# NOTE(review): values appear to follow a "CODE — Name" convention (em dash
# separator); downstream display logic relies on that shape — keep it when
# customizing.
#
# Examples — replace with your own:
PROJECT_NAMES: dict[str, str] = {
"wiki": "WIKI — This Wiki",
"cl": "CL — Claude Config",
# "web": "WEB — My Webapp",
# "mob": "MOB — My Mobile App",
# "work": "WORK — Day Job",
"general": "General — Cross-Project",
}
# Order for display — put your most-active projects first
PROJECT_ORDER = [
# "work", "web", "mob",
"wiki", "cl", "general",
]
# ---------------------------------------------------------------------------
# Frontmatter parsing
# ---------------------------------------------------------------------------
def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse flat `key: value` YAML frontmatter from a markdown file.

    Returns an empty dict when the file has no leading ``---`` block.
    Only the first colon on each line splits, so values containing colons
    survive intact. Nested YAML structures are not supported.
    """
    fm: dict[str, str] = {}
    # Read as UTF-8 explicitly so parsing doesn't depend on the platform's
    # locale encoding (the pipeline writes these files as UTF-8 markdown).
    content = file_path.read_text(encoding="utf-8")
    # Frontmatter is delimited by --- markers at the very top of the file.
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return fm
    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()
    return fm
def get_summary_line(file_path: Path) -> str:
    """Return the first sentence of the file's ``## Summary`` section.

    The sentence always ends with a period and is capped at 120 characters
    (ellipsis-truncated). Returns a placeholder string when no Summary
    section is found.
    """
    content = file_path.read_text(encoding="utf-8")
    # \Z lets a Summary section that ends the file (no trailing blank line
    # or following heading) still match, instead of falling through to the
    # "No summary available." placeholder.
    match = re.search(r"## Summary\n\n(.+?)(?:\n\n|\n##|\Z)", content, re.DOTALL)
    if not match:
        return "No summary available."
    summary = match.group(1).strip()
    # First sentence (naive split on ". "; abbreviations may over-split).
    first_sentence = summary.split(". ")[0]
    if not first_sentence.endswith("."):
        first_sentence += "."
    # Truncate if too long
    if len(first_sentence) > 120:
        first_sentence = first_sentence[:117] + "..."
    return first_sentence
def _get_section_bullets(file_path: Path, heading: str) -> list[str]:
    """Return the ``- `` bullet items under ``## <heading>`` in a markdown file.

    The section runs until the next heading, a horizontal rule, or EOF.
    Returns an empty list when the heading is absent.
    """
    content = file_path.read_text(encoding="utf-8")
    items: list[str] = []
    # `.*?` after the heading tolerates suffixes like "## Decisions Made".
    match = re.search(rf"## {heading}.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                items.append(line[2:])
    return items


def get_decisions(file_path: Path) -> list[str]:
    """Extract decision bullet points from a conversation file."""
    return _get_section_bullets(file_path, "Decisions")


def get_discoveries(file_path: Path) -> list[str]:
    """Extract discovery bullet points from a conversation file."""
    return _get_section_bullets(file_path, "Discoveries")
# ---------------------------------------------------------------------------
# Conversation discovery
# ---------------------------------------------------------------------------
def discover_conversations() -> dict[str, list[dict[str, Any]]]:
    """Discover all conversation markdown files, grouped by project code.

    Scans each immediate subdirectory of CONVERSATIONS_DIR whose name is a
    registered project code, listing its ``*.md`` files newest-name-first.
    Returns a mapping of project code to a list of metadata dicts
    (file, relative, title, date, status, messages, halls, topics, project).
    """
    by_project: dict[str, list[dict[str, Any]]] = defaultdict(list)
    # First run may happen before anything has been mined; don't crash on
    # a missing conversations directory.
    if not CONVERSATIONS_DIR.is_dir():
        return by_project
    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        project_code = project_dir.name
        # Skip directories that aren't registered project codes.
        if project_code not in PROJECT_NAMES:
            continue
        # reverse=True puts newest (date-prefixed) filenames first.
        # (A `.gitkeep` check is unnecessary: the *.md glob can't match it.)
        for md_file in sorted(project_dir.glob("*.md"), reverse=True):
            fm = parse_frontmatter(md_file)
            by_project[project_code].append({
                "file": md_file,
                "relative": md_file.relative_to(CONVERSATIONS_DIR),
                "title": fm.get("title", md_file.stem),
                "date": fm.get("date", "unknown"),
                # Files start life as "extracted"; summarization flips it.
                "status": fm.get("status", "extracted"),
                "messages": fm.get("messages", "0"),
                "halls": fm.get("halls", ""),
                "topics": fm.get("topics", ""),
                "project": project_code,
            })
    return by_project
# ---------------------------------------------------------------------------
# Index generation
# ---------------------------------------------------------------------------
def generate_index(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate the conversations/index.md content.

    Produces a catalog grouped by project (in PROJECT_ORDER) with summary
    counts in the header. Trivial sessions are tallied but not listed.
    """
    total = sum(len(convos) for convos in by_project.values())
    summarized = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "summarized"
    )
    trivial = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "trivial"
    )
    extracted = total - summarized - trivial
    lines = [
        "---",
        "title: Conversation Index",
        "type: index",
        f"last_updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}",
        "---",
        "",
        "# Conversation Index",
        "",
        "Mined conversations from Claude Code sessions, organized by project (wing).",
        "",
        f"**{total} conversations** — {summarized} summarized, {extracted} pending, {trivial} trivial.",
        "",
        "---",
        "",
    ]
    for project_code in PROJECT_ORDER:
        convos = by_project.get(project_code, [])
        display_name = PROJECT_NAMES.get(project_code, project_code.upper())
        lines.append(f"## {display_name}")
        lines.append("")
        if not convos:
            lines.append("_No conversations mined yet._")
            lines.append("")
            continue
        # Show summarized first, then extracted; skip trivial from listing.
        shown = 0
        for c in convos:
            if c["status"] == "trivial":
                continue
            status_tag = " _(pending summary)_" if c["status"] == "extracted" else ""
            # BUG FIX: the summary used to be concatenated directly after the
            # closing parenthesis with no separator; prefix an em dash so the
            # line reads "- [title](path) (date, N msgs) — Summary."
            summary_text = ""
            if c["status"] == "summarized":
                summary_text = f" — {get_summary_line(c['file'])}"
            lines.append(
                f"- [{c['title']}]({c['relative']})"
                f" ({c['date']}, {c['messages']} msgs)"
                f"{summary_text}{status_tag}"
            )
            shown += 1
        trivial_count = len(convos) - shown
        if trivial_count > 0:
            # Leading \n yields a blank line that keeps the note out of
            # the bullet list once everything is joined.
            lines.append(f"\n_{trivial_count} trivial session(s) not listed._")
        lines.append("")
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Context generation
# ---------------------------------------------------------------------------
def generate_wakeup(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/wake-up.md from recent conversations.

    Builds a project roster table (activity status derived from the most
    recent session date) plus the most recent decisions and discoveries
    pulled from summarized conversation files.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # Determine activity level per project.
    project_activity: dict[str, dict[str, Any]] = {}
    for code in PROJECT_ORDER:
        convos = by_project.get(code, [])
        summarized = [c for c in convos if c["status"] == "summarized"]
        if summarized:
            latest = max(summarized, key=lambda c: c["date"])
            last_date = latest["date"]
            # Simple activity heuristic: sessions in last 7 days = active.
            try:
                dt = datetime.strptime(last_date, "%Y-%m-%d")
                days_ago = (datetime.now() - dt).days
                if days_ago <= 7:
                    status = "Active"
                elif days_ago <= 30:
                    status = "Quiet"
                else:
                    status = "Inactive"
            except ValueError:
                # Frontmatter date wasn't ISO-formatted; can't judge recency.
                status = "Unknown"
                last_date = ""
        elif convos:
            # Extracted-only: "this month" check via YYYY-MM prefix compare.
            latest = max(convos, key=lambda c: c["date"])
            last_date = latest["date"]
            status = "Active" if latest["date"] >= today[:7] else "Quiet"
        else:
            status = ""
            last_date = ""
        project_activity[code] = {
            "status": status,
            "last_date": last_date,
            "count": len(convos),
        }
    # Gather recent decisions across all projects.
    recent_decisions: list[tuple[str, str, str]] = []  # (date, project, decision)
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for decision in get_decisions(c["file"]):
                recent_decisions.append((c["date"], code, decision))
    recent_decisions.sort(key=lambda x: x[0], reverse=True)
    recent_decisions = recent_decisions[:10]  # Top 10 most recent
    # Gather recent discoveries.
    recent_discoveries: list[tuple[str, str, str]] = []
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for disc in get_discoveries(c["file"]):
                recent_discoveries.append((c["date"], code, disc))
    recent_discoveries.sort(key=lambda x: x[0], reverse=True)
    recent_discoveries = recent_discoveries[:5]
    lines = [
        "---",
        "title: Wake-Up Briefing",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Wake-Up Briefing",
        "",
        "Auto-generated world state for AI session context.",
        "",
        "## Active Projects",
        "",
        "| Code | Project | Status | Last Activity | Sessions |",
        "|------|---------|--------|---------------|----------|",
    ]
    for code in PROJECT_ORDER:
        if code == "general":
            continue  # Skip general from roster
        info = project_activity.get(code, {"status": "", "last_date": "", "count": 0})
        # BUG FIX: the original split on an empty separator ("" — apparently a
        # mojibake'd em dash), which always raises ValueError since `"" in s`
        # is True for any string. PROJECT_NAMES values follow the
        # "CODE — Name" convention, so take the part after the em dash.
        full_name = PROJECT_NAMES.get(code, code)
        display = full_name.split("—", 1)[1].strip() if "—" in full_name else code
        lines.append(
            f"| {code.upper()} | {display} | {info['status']} | {info['last_date']} | {info['count']} |"
        )
    lines.append("")
    if recent_decisions:
        lines.append("## Recent Decisions")
        lines.append("")
        for date, proj, decision in recent_decisions[:7]:
            lines.append(f"- **[{proj.upper()}]** {decision} ({date})")
        lines.append("")
    if recent_discoveries:
        lines.append("## Recent Discoveries")
        lines.append("")
        for date, proj, disc in recent_discoveries[:5]:
            lines.append(f"- **[{proj.upper()}]** {disc} ({date})")
        lines.append("")
    if not recent_decisions and not recent_discoveries:
        lines.append("## Recent Decisions")
        lines.append("")
        lines.append("_Populated after summarization runs._")
        lines.append("")
    return "\n".join(lines)
def generate_concerns(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/active-concerns.md from recent conversations.

    Largely a template today; blockers and open questions get filled in
    as summaries accumulate. A "Current Focus Areas" section lists the
    projects with sessions this month, hottest first.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    lines = [
        "---",
        "title: Active Concerns",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Active Concerns",
        "",
        "Auto-generated from recent conversations. Current blockers, deadlines, and open questions.",
        "",
    ]
    # "This month" = date string starts with the current YYYY-MM prefix
    # (lexicographic >= on ISO dates).
    month_prefix = today[:7]
    hot = [
        (code, sum(1 for c in by_project.get(code, []) if c["date"] >= month_prefix))
        for code in PROJECT_ORDER
    ]
    hot = [(code, n) for code, n in hot if n > 0]
    if hot:
        hot.sort(key=lambda pair: pair[1], reverse=True)
        lines.append("## Current Focus Areas")
        lines.append("")
        for code, n in hot[:5]:
            lines.append(f"- **{PROJECT_NAMES.get(code, code)}** — {n} session(s) this month")
        lines.append("")
    lines += [
        "## Blockers",
        "",
        "_Populated from conversation analysis._",
        "",
        "## Open Questions",
        "",
        "_Populated from conversation analysis._",
        "",
    ]
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Regenerate the conversation index and context files; optionally reindex."""
    arg_parser = argparse.ArgumentParser(
        description="Update conversation index and context files",
    )
    arg_parser.add_argument(
        "--reindex",
        action="store_true",
        help="Also trigger qmd update and embed after updating files",
    )
    opts = arg_parser.parse_args()

    # Discover every mined conversation, grouped by project code.
    grouped = discover_conversations()
    count = sum(len(v) for v in grouped.values())
    print(f"Found {count} conversation(s) across {len(grouped)} projects.")

    # Regenerate each output file, creating parent directories as needed.
    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    INDEX_FILE.write_text(generate_index(grouped))
    print(f"Updated {INDEX_FILE.relative_to(WIKI_DIR)}")

    WAKEUP_FILE.parent.mkdir(parents=True, exist_ok=True)
    WAKEUP_FILE.write_text(generate_wakeup(grouped))
    print(f"Updated {WAKEUP_FILE.relative_to(WIKI_DIR)}")

    CONCERNS_FILE.write_text(generate_concerns(grouped))
    print(f"Updated {CONCERNS_FILE.relative_to(WIKI_DIR)}")

    # Optionally hand off to qmd so the search index picks up the new files.
    if opts.reindex:
        print("Triggering qmd reindex...")
        try:
            for subcmd in ("update", "embed"):
                subprocess.run(["qmd", subcmd], check=True, capture_output=True)
            print("qmd index updated.")
        except FileNotFoundError:
            # qmd is optional tooling; absence is not an error for this script.
            print("qmd not found — skipping reindex.", file=sys.stderr)
        except subprocess.CalledProcessError as e:
            print(f"qmd reindex failed: {e}", file=sys.stderr)


if __name__ == "__main__":
    main()