A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes: - 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library) - Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE - Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup - 171-test pytest suite (cross-platform, runs in ~1.3s) - MIT licensed
477 lines · 16 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""Update conversation index and context files from summarized conversations.
|
|
|
|
Phase C of the conversation mining pipeline. Reads all conversation markdown
|
|
files and regenerates:
|
|
- conversations/index.md — catalog organized by project
|
|
- context/wake-up.md — world briefing from recent conversations
|
|
- context/active-concerns.md — current blockers and open threads
|
|
|
|
Usage:
|
|
python3 update-conversation-index.py
|
|
python3 update-conversation-index.py --reindex # Also triggers qmd update
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Wiki root — override with the WIKI_DIR environment variable.
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
# Per-project conversation markdown lives under conversations/<code>/.
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
# Auto-generated context briefings live under context/.
CONTEXT_DIR = WIKI_DIR / "context"
# Files regenerated from scratch on every run.
INDEX_FILE = CONVERSATIONS_DIR / "index.md"
WAKEUP_FILE = CONTEXT_DIR / "wake-up.md"
CONCERNS_FILE = CONTEXT_DIR / "active-concerns.md"
|
|
# ════════════════════════════════════════════════════════════════════════════
# CONFIGURE ME — Project code to display name mapping
# ════════════════════════════════════════════════════════════════════════════
#
# Every project code you use in `extract-sessions.py`'s PROJECT_MAP should
# have a display name here. The conversation index groups conversations by
# these codes and renders them under sections named by the display name.
#
# Examples — replace with your own:
PROJECT_NAMES: dict[str, str] = {
    "wiki": "WIKI — This Wiki",
    "cl": "CL — Claude Config",
    # "web": "WEB — My Webapp",
    # "mob": "MOB — My Mobile App",
    # "work": "WORK — Day Job",
    "general": "General — Cross-Project",
}

# Order for display — put your most-active projects first.
# NOTE: sections render strictly in this order, and codes missing from this
# list never appear in the generated index — keep it in sync with
# PROJECT_NAMES above.
PROJECT_ORDER = [
    # "work", "web", "mob",
    "wiki", "cl", "general",
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Frontmatter parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse simple YAML-style frontmatter from a markdown file.

    Only flat ``key: value`` pairs are understood — nested YAML, lists,
    and quoting are returned as raw strings.

    Args:
        file_path: Markdown file expected to start with a ``---`` block.

    Returns:
        Mapping of frontmatter keys to stripped string values; empty
        dict when no frontmatter block is present.
    """
    fm: dict[str, str] = {}
    # Explicit UTF-8: bare read_text() uses the locale encoding, which
    # breaks on Windows for non-ASCII titles/topics.
    content = file_path.read_text(encoding="utf-8")

    # Frontmatter is the text between the opening --- markers.
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return fm

    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()

    return fm
|
|
|
|
|
|
def get_summary_line(file_path: Path) -> str:
    """Extract the first sentence of the ``## Summary`` section.

    Returns a sentence capped at 120 characters (ellipsized when longer),
    or ``"No summary available."`` when the section is missing.
    """
    content = file_path.read_text(encoding="utf-8")
    # \Z lets a Summary section that ends the file (no trailing blank
    # line or next heading) still match; the original regex missed it.
    match = re.search(r"## Summary\n\n(.+?)(?:\n\n|\n##|\Z)", content, re.DOTALL)
    if not match:
        return "No summary available."

    summary = match.group(1).strip()
    # Naive ". "-based sentence split — good enough for generated prose.
    first_sentence = summary.split(". ")[0]
    if not first_sentence.endswith("."):
        first_sentence += "."
    # Keep index lines readable.
    if len(first_sentence) > 120:
        first_sentence = first_sentence[:117] + "..."
    return first_sentence
|
|
|
|
|
|
def get_decisions(file_path: Path) -> list[str]:
    """Extract bullet items from the ``## Decisions`` section.

    Returns the text of each ``- `` bullet (marker stripped); empty
    list when the section is absent or has no bullets.
    """
    content = file_path.read_text(encoding="utf-8")
    decisions: list[str] = []
    # Section body runs until the next heading, a --- rule, or EOF.
    match = re.search(r"## Decisions.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                decisions.append(line[2:])
    return decisions
|
|
|
|
|
|
def get_discoveries(file_path: Path) -> list[str]:
    """Extract bullet items from the ``## Discoveries`` section.

    Returns the text of each ``- `` bullet (marker stripped); empty
    list when the section is absent or has no bullets.
    """
    content = file_path.read_text(encoding="utf-8")
    discoveries: list[str] = []
    # Section body runs until the next heading, a --- rule, or EOF.
    match = re.search(r"## Discoveries.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                discoveries.append(line[2:])
    return discoveries
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Conversation discovery
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def discover_conversations() -> dict[str, list[dict[str, Any]]]:
    """Discover all conversation files organized by project.

    Scans each subdirectory of CONVERSATIONS_DIR whose name is a known
    project code, reads the frontmatter of every ``*.md`` file, and
    returns ``{project_code: [entry, ...]}`` with entries in reverse
    filename order (newest first, since filenames are date-prefixed —
    TODO confirm against extract-sessions.py naming).
    """
    by_project: dict[str, list[dict[str, Any]]] = defaultdict(list)

    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue

        project_code = project_dir.name
        # Directories without a configured display name are ignored.
        if project_code not in PROJECT_NAMES:
            continue

        # NOTE: the original also skipped ".gitkeep" here, but
        # glob("*.md") can never yield it — dead check removed.
        for md_file in sorted(project_dir.glob("*.md"), reverse=True):
            fm = parse_frontmatter(md_file)
            by_project[project_code].append({
                "file": md_file,
                "relative": md_file.relative_to(CONVERSATIONS_DIR),
                "title": fm.get("title", md_file.stem),
                "date": fm.get("date", "unknown"),
                "status": fm.get("status", "extracted"),
                "messages": fm.get("messages", "0"),
                "halls": fm.get("halls", ""),
                "topics": fm.get("topics", ""),
                "project": project_code,
            })

    return by_project
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Index generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def generate_index(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate the conversations/index.md content.

    Emits a frontmattered markdown catalog: overall status counts, then
    one section per project (in PROJECT_ORDER) listing its non-trivial
    conversations — summarized ones get a one-line summary, extracted
    ones a "pending" tag, trivial ones only a count.
    """
    total = sum(len(convos) for convos in by_project.values())
    summarized = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "summarized"
    )
    trivial = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "trivial"
    )
    extracted = total - summarized - trivial

    lines = [
        "---",
        "title: Conversation Index",
        "type: index",
        f"last_updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}",
        "---",
        "",
        "# Conversation Index",
        "",
        # Plain string (the original was an f-string with no placeholders).
        "Mined conversations from Claude Code sessions, organized by project (wing).",
        "",
        f"**{total} conversations** — {summarized} summarized, {extracted} pending, {trivial} trivial.",
        "",
        "---",
        "",
    ]

    for project_code in PROJECT_ORDER:
        convos = by_project.get(project_code, [])
        display_name = PROJECT_NAMES.get(project_code, project_code.upper())

        lines.append(f"## {display_name}")
        lines.append("")

        if not convos:
            lines.append("_No conversations mined yet._")
            lines.append("")
            continue

        # List summarized and extracted entries in discovery order;
        # trivial sessions are only counted, never listed.
        shown = 0
        for c in convos:
            if c["status"] == "trivial":
                continue

            status_tag = " _(pending summary)_" if c["status"] == "extracted" else ""

            summary_text = ""
            if c["status"] == "summarized":
                summary_text = f" — {get_summary_line(c['file'])}"

            # as_posix(): markdown links need forward slashes even when
            # this runs on Windows.
            lines.append(
                f"- [{c['title']}]({c['relative'].as_posix()})"
                f" ({c['date']}, {c['messages']} msgs)"
                f"{summary_text}{status_tag}"
            )
            shown += 1

        trivial_count = len(convos) - shown
        if trivial_count > 0:
            lines.append(f"\n_{trivial_count} trivial session(s) not listed._")

        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Context generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _assess_activity(convos: list[dict[str, Any]], today: str) -> dict[str, Any]:
    """Classify one project's activity level from its conversations.

    Returns ``{"status", "last_date", "count"}`` where status is Active
    (session in the last 7 days), Quiet (30 days), Inactive, Unknown
    (unparseable date), or "—" (no conversations at all).
    """
    summarized = [c for c in convos if c["status"] == "summarized"]

    if summarized:
        latest = max(summarized, key=lambda c: c["date"])
        last_date = latest["date"]
        try:
            dt = datetime.strptime(last_date, "%Y-%m-%d")
            days_ago = (datetime.now() - dt).days
            if days_ago <= 7:
                status = "Active"
            elif days_ago <= 30:
                status = "Quiet"
            else:
                status = "Inactive"
        except ValueError:
            status = "Unknown"
            last_date = "—"
    elif convos:
        # Only extracted (unsummarized) sessions exist: call the project
        # active when the newest one falls within the current month.
        latest = max(convos, key=lambda c: c["date"])
        last_date = latest["date"]
        status = "Active" if latest["date"] >= today[:7] else "Quiet"
    else:
        status = "—"
        last_date = "—"

    return {"status": status, "last_date": last_date, "count": len(convos)}


def _recent_items(
    by_project: dict[str, list[dict[str, Any]]],
    extractor,
    limit: int,
) -> list[tuple[str, str, str]]:
    """Collect (date, project, text) tuples from summarized conversations.

    ``extractor`` is a callable like get_decisions/get_discoveries that
    maps a file path to a list of strings. Result is sorted newest-first
    and truncated to ``limit`` items.
    """
    items: list[tuple[str, str, str]] = []
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for text in extractor(c["file"]):
                items.append((c["date"], code, text))
    items.sort(key=lambda x: x[0], reverse=True)
    return items[:limit]


def generate_wakeup(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/wake-up.md from recent conversations.

    Produces a per-project activity roster table plus the most recent
    decisions and discoveries harvested from summarized conversations.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Activity level for every configured project, including "general".
    project_activity = {
        code: _assess_activity(by_project.get(code, []), today)
        for code in PROJECT_ORDER
    }

    recent_decisions = _recent_items(by_project, get_decisions, 10)
    recent_discoveries = _recent_items(by_project, get_discoveries, 5)

    lines = [
        "---",
        "title: Wake-Up Briefing",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Wake-Up Briefing",
        "",
        "Auto-generated world state for AI session context.",
        "",
        "## Active Projects",
        "",
        "| Code | Project | Status | Last Activity | Sessions |",
        "|------|---------|--------|---------------|----------|",
    ]

    for code in PROJECT_ORDER:
        if code == "general":
            continue  # Cross-project bucket is not a roster entry
        info = project_activity.get(code, {"status": "—", "last_date": "—", "count": 0})
        name = PROJECT_NAMES.get(code, "")
        # Display only the human half of "CODE — Name"; fall back to the code.
        display = name.split(" — ")[1] if " — " in name else code
        lines.append(
            f"| {code.upper()} | {display} | {info['status']} | {info['last_date']} | {info['count']} |"
        )

    lines.append("")

    if recent_decisions:
        lines.append("## Recent Decisions")
        lines.append("")
        for date, proj, decision in recent_decisions[:7]:
            lines.append(f"- **[{proj.upper()}]** {decision} ({date})")
        lines.append("")

    if recent_discoveries:
        lines.append("## Recent Discoveries")
        lines.append("")
        for date, proj, disc in recent_discoveries[:5]:
            lines.append(f"- **[{proj.upper()}]** {disc} ({date})")
        lines.append("")

    if not recent_decisions and not recent_discoveries:
        lines.append("## Recent Decisions")
        lines.append("")
        lines.append("_Populated after summarization runs._")
        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def generate_concerns(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/active-concerns.md from recent conversations.

    Currently a mostly-template document: the focus-area section is
    derived from this month's session counts, while Blockers and Open
    Questions stay placeholders until summary parsing is implemented.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    month_prefix = today[:7]

    header = [
        "---",
        "title: Active Concerns",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Active Concerns",
        "",
        "Auto-generated from recent conversations. Current blockers, deadlines, and open questions.",
        "",
    ]

    # Count this month's sessions per project, keeping configured order
    # so the later stable sort breaks ties by PROJECT_ORDER.
    monthly = [
        (code, sum(1 for c in by_project.get(code, []) if c["date"] >= month_prefix))
        for code in PROJECT_ORDER
    ]
    hot = [(code, n) for code, n in monthly if n]

    body: list[str] = []
    if hot:
        hot.sort(key=lambda item: item[1], reverse=True)
        body.append("## Current Focus Areas")
        body.append("")
        for code, n in hot[:5]:
            body.append(f"- **{PROJECT_NAMES.get(code, code)}** — {n} session(s) this month")
        body.append("")

    body += [
        "## Blockers",
        "",
        "_Populated from conversation analysis._",
        "",
        "## Open Questions",
        "",
        "_Populated from conversation analysis._",
        "",
    ]

    return "\n".join(header + body)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: regenerate index and context files, optionally reindex qmd."""
    parser = argparse.ArgumentParser(
        description="Update conversation index and context files",
    )
    parser.add_argument(
        "--reindex",
        action="store_true",
        help="Also trigger qmd update and embed after updating files",
    )
    args = parser.parse_args()

    # Discover all conversations grouped by project code.
    by_project = discover_conversations()
    total = sum(len(v) for v in by_project.values())
    print(f"Found {total} conversation(s) across {len(by_project)} projects.")

    # Generate and write the index. Explicit UTF-8 on writes: the content
    # contains em-dashes etc. that the Windows locale codec may reject.
    index_content = generate_index(by_project)
    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    INDEX_FILE.write_text(index_content, encoding="utf-8")
    print(f"Updated {INDEX_FILE.relative_to(WIKI_DIR)}")

    # Generate and write context files (create dir if needed).
    WAKEUP_FILE.parent.mkdir(parents=True, exist_ok=True)
    WAKEUP_FILE.write_text(generate_wakeup(by_project), encoding="utf-8")
    print(f"Updated {WAKEUP_FILE.relative_to(WIKI_DIR)}")

    CONCERNS_FILE.write_text(generate_concerns(by_project), encoding="utf-8")
    print(f"Updated {CONCERNS_FILE.relative_to(WIKI_DIR)}")

    # Optionally trigger qmd reindex (best-effort; failures don't abort).
    if args.reindex:
        print("Triggering qmd reindex...")
        try:
            subprocess.run(["qmd", "update"], check=True, capture_output=True)
            subprocess.run(["qmd", "embed"], check=True, capture_output=True)
            print("qmd index updated.")
        except FileNotFoundError:
            print("qmd not found — skipping reindex.", file=sys.stderr)
        except subprocess.CalledProcessError as e:
            # capture_output=True swallowed qmd's own stderr; surface it
            # so the failure is actually diagnosable.
            detail = e.stderr.decode(errors="replace").strip() if e.stderr else ""
            message = f"qmd reindex failed: {e}"
            if detail:
                message += f"\n{detail}"
            print(message, file=sys.stderr)


if __name__ == "__main__":
    main()
|