Initial commit — memex

A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
This commit is contained in:
Eric Turner
2026-04-12 21:16:02 -06:00
commit ee54a2f5d4
31 changed files with 10792 additions and 0 deletions

810
scripts/extract-sessions.py Executable file
View File

@@ -0,0 +1,810 @@
#!/usr/bin/env python3
"""Extract Claude Code session JSONL files into clean markdown transcripts.
Phase A of the conversation mining pipeline. Deterministic, no LLM dependency.
Handles incremental extraction via byte offset tracking for sessions that span
hours or days.
Usage:
python3 extract-sessions.py # Extract all new sessions
python3 extract-sessions.py --project mc # Extract one project
python3 extract-sessions.py --session 0a543572 # Extract specific session
python3 extract-sessions.py --dry-run # Show what would be extracted
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Both roots are overridable via environment variables (useful for tests or
# non-standard layouts); defaults match a typical Claude Code install.
CLAUDE_PROJECTS_DIR = Path(os.environ.get("CLAUDE_PROJECTS_DIR", str(Path.home() / ".claude" / "projects")))
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
# Extracted markdown transcripts land under <wiki>/conversations/<project>/.
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
# Persisted per-session byte offsets and message counts for incremental runs.
MINE_STATE_FILE = WIKI_DIR / ".mine-state.json"
# ════════════════════════════════════════════════════════════════════════════
# CONFIGURE ME — Map Claude project directory suffixes to wiki project codes
# ════════════════════════════════════════════════════════════════════════════
#
# Claude Code stores sessions under ~/.claude/projects/<hashed-path>/. The
# directory name is derived from the absolute path of your project, so it
# looks like `-Users-alice-projects-myapp` or `-home-alice-projects-myapp`.
#
# This map tells the extractor which suffix maps to which short wiki code
# (the "wing"). More specific suffixes should appear first — the extractor
# picks the first match. Everything unmatched goes into `general/`.
#
# Examples — replace with your own projects:
PROJECT_MAP: dict[str, str] = {
    # More specific suffixes first
    "projects-wiki": "wiki",  # this wiki itself
    "-claude": "cl",  # ~/.claude config repo
    # Add your real projects here:
    # "my-webapp": "web",
    # "my-mobile-app": "mob",
    # "work-mono-repo": "work",
    # Catch-all — Claude sessions outside any tracked project
    # (matches the home-dir prefix on Linux and macOS respectively)
    "-home": "general",
    "-Users": "general",
}
# Tool call names to keep full output for
KEEP_FULL_OUTPUT_TOOLS = {"Bash", "Skill"}
# Tool call names to summarize (just note what was accessed)
SUMMARIZE_TOOLS = {"Read", "Glob", "Grep"}
# Tool call names to keep with path + change summary
KEEP_CHANGE_TOOLS = {"Edit", "Write"}
# Tool call names to keep description + result summary
KEEP_SUMMARY_TOOLS = {"Agent"}
# Max lines of Bash output to keep
MAX_BASH_OUTPUT_LINES = 200
# ---------------------------------------------------------------------------
# State management
# ---------------------------------------------------------------------------
def load_state() -> dict[str, Any]:
    """Load mining state from .mine-state.json.

    Returns:
        The persisted state dict, or a fresh empty state
        (``{"sessions": {}, "last_run": None}``) when no state file exists.
    """
    if MINE_STATE_FILE.exists():
        # Explicit encoding: the platform default is not UTF-8 everywhere
        # (e.g. cp1252 on Windows), but the state file is UTF-8 JSON.
        with open(MINE_STATE_FILE, encoding="utf-8") as f:
            return json.load(f)
    return {"sessions": {}, "last_run": None}
def save_state(state: dict[str, Any]) -> None:
    """Save mining state to .mine-state.json.

    Side effect: stamps ``state["last_run"]`` with the current UTC time
    (ISO-8601) before writing, mutating the caller's dict.
    """
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    # Explicit encoding so the file round-trips with load_state() on
    # platforms whose default encoding is not UTF-8.
    with open(MINE_STATE_FILE, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)
# ---------------------------------------------------------------------------
# Project mapping
# ---------------------------------------------------------------------------
def resolve_project_code(dir_name: str) -> str | None:
    """Map a Claude project directory name to a wiki project code.

    Directory names look like: -Users-alice-projects-myapp or
    -home-alice-projects-myapp. The first matching suffix in PROJECT_MAP
    wins (insertion order), so more specific entries must come first.
    Returns None when nothing matches.
    """
    return next(
        (code for suffix, code in PROJECT_MAP.items() if dir_name.endswith(suffix)),
        None,
    )
def discover_sessions(
    project_filter: str | None = None,
    session_filter: str | None = None,
) -> list[dict[str, Any]]:
    """Discover JSONL session files from Claude projects directory.

    Walks every project directory, maps it to a wiki code, and collects
    one record per .jsonl file. `project_filter` restricts by wiki code;
    `session_filter` is a prefix match on the session id (file stem).
    """
    found: list[dict[str, Any]] = []
    if not CLAUDE_PROJECTS_DIR.exists():
        print(f"Claude projects directory not found: {CLAUDE_PROJECTS_DIR}", file=sys.stderr)
        return found
    for project_dir in sorted(CLAUDE_PROJECTS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        wing = resolve_project_code(project_dir.name)
        # Skip unmapped directories and those excluded by the filter.
        if wing is None or (project_filter and wing != project_filter):
            continue
        for session_file in sorted(project_dir.glob("*.jsonl")):
            sid = session_file.stem
            if session_filter and not sid.startswith(session_filter):
                continue
            found.append({
                "session_id": sid,
                "project": wing,
                "jsonl_path": session_file,
                "file_size": session_file.stat().st_size,
            })
    return found
# ---------------------------------------------------------------------------
# JSONL parsing and filtering
# ---------------------------------------------------------------------------
def extract_timestamp(obj: dict[str, Any]) -> str | None:
    """Return the record's timestamp as an ISO-8601 string, or None.

    String timestamps pass through unchanged; numeric ones are treated
    as epoch milliseconds and converted to UTC ISO format.
    """
    stamp = obj.get("timestamp")
    if isinstance(stamp, (int, float)):
        # Numeric timestamps are epoch milliseconds.
        return datetime.fromtimestamp(stamp / 1000, tz=timezone.utc).isoformat()
    return stamp if isinstance(stamp, str) else None
def extract_session_date(obj: dict[str, Any]) -> str:
    """Get date string (YYYY-MM-DD) from a JSONL record timestamp.

    Falls back to today's UTC date when the record has no parseable
    timestamp.
    """
    stamp = extract_timestamp(obj)
    if stamp:
        try:
            # fromisoformat (pre-3.11) does not accept a trailing "Z".
            parsed = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            pass
        else:
            return parsed.strftime("%Y-%m-%d")
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
def truncate_lines(text: str, max_lines: int) -> str:
    """Truncate text to max_lines, adding a note if truncated."""
    all_lines = text.splitlines()
    dropped = len(all_lines) - max_lines
    if dropped <= 0:
        return text
    # The marker carries a leading newline, so the joined output shows a
    # blank line before the truncation notice.
    marker = f"\n[... {dropped} lines truncated ...]"
    return "\n".join(all_lines[:max_lines] + [marker])
def format_tool_use(name: str, input_data: dict[str, Any]) -> str | None:
    """Format a tool_use content block for the transcript.

    Args:
        name: The tool's name as recorded in the JSONL block.
        input_data: The tool's input payload.

    Returns:
        A one-line transcript entry, or None for tools that are pure noise
        (e.g. ToolSearch). Unknown tools get a bare ``[Name]`` marker.
    """
    if name in KEEP_FULL_OUTPUT_TOOLS:
        if name == "Bash":
            cmd = input_data.get("command", "")
            desc = input_data.get("description", "")
            # Prefer the human-written description; fall back to the command.
            label = desc if desc else cmd[:100]
            return f"**[Bash]**: `{label}`"
        if name == "Skill":
            skill = input_data.get("skill", "")
            args = input_data.get("args", "")
            # strip() drops the trailing space left when args is empty.
            return f"**[Skill]**: /{skill} {args}".strip()
    if name in SUMMARIZE_TOOLS:
        if name == "Read":
            fp = input_data.get("file_path", "?")
            return f"[Read: {fp}]"
        if name == "Glob":
            pattern = input_data.get("pattern", "?")
            return f"[Glob: {pattern}]"
        if name == "Grep":
            pattern = input_data.get("pattern", "?")
            path = input_data.get("path", "")
            return f"[Grep: '{pattern}' in {path}]" if path else f"[Grep: '{pattern}']"
    if name in KEEP_CHANGE_TOOLS:
        if name == "Edit":
            fp = input_data.get("file_path", "?")
            old = input_data.get("old_string", "")[:60]
            return f"**[Edit]**: {fp} — replaced '{old}...'"
        if name == "Write":
            fp = input_data.get("file_path", "?")
            content_len = len(input_data.get("content", ""))
            return f"**[Write]**: {fp} ({content_len} chars)"
    if name in KEEP_SUMMARY_TOOLS:
        if name == "Agent":
            desc = input_data.get("description", "?")
            return f"**[Agent]**: {desc}"
    if name == "ToolSearch":
        return None  # noise
    if name == "TaskCreate":
        subj = input_data.get("subject", "?")
        return f"[TaskCreate: {subj}]"
    if name == "TaskUpdate":
        tid = input_data.get("taskId", "?")
        status = input_data.get("status", "?")
        # Separator restored: id and status previously ran together
        # ("#3done"); " — " matches the Edit formatter above.
        return f"[TaskUpdate: #{tid} — {status}]"
    # Default: note the tool was called
    return f"[{name}]"
def format_tool_result(
    tool_name: str | None,
    content: Any,
    is_error: bool = False,
) -> str | None:
    """Format a tool_result content block for the transcript.

    How much of the result survives depends on which policy set the
    originating tool belongs to; returns None when the result should be
    dropped entirely.
    """
    # Flatten the result payload into plain text.
    if isinstance(content, str):
        text = content
    elif isinstance(content, list):
        text = "\n".join(
            item.get("text", "")
            for item in content
            if isinstance(item, dict) and item.get("type") == "text"
        )
    else:
        text = ""
    if not text.strip():
        return None
    if is_error:
        body = truncate_lines(text, MAX_BASH_OUTPUT_LINES)
        return f"**[ERROR]**:\n```\n{body}\n```"
    if tool_name in KEEP_FULL_OUTPUT_TOOLS:
        body = truncate_lines(text, MAX_BASH_OUTPUT_LINES)
        return f"```\n{body}\n```"
    if tool_name in SUMMARIZE_TOOLS:
        # Just note the result size
        return f"[→ {len(text.splitlines())} lines, {len(text)} chars]"
    if tool_name in KEEP_CHANGE_TOOLS:
        return None  # The tool_use already captured what changed
    if tool_name in KEEP_SUMMARY_TOOLS:
        # Keep a trimmed summary of agent results
        summary = text[:300] + ("..." if len(text) > 300 else "")
        return f"> {summary}"
    return None
def parse_content_blocks(
    content: list[dict[str, Any]],
    role: str,
    tool_id_to_name: dict[str, str],
) -> list[str]:
    """Parse content blocks from a message into transcript lines.

    Side effect: records tool_use id→name pairs into `tool_id_to_name`
    so later tool_result blocks can be attributed. `role` is accepted for
    signature compatibility but unused here.
    """
    rendered: list[str] = []
    for entry in content:
        kind = entry.get("type")
        if kind == "thinking":
            continue  # thinking blocks are never transcribed
        if kind == "text":
            body = entry.get("text", "").strip()
            if not body:
                continue
            if "<system-reminder>" in body:
                # Strip system reminder tags and their content
                body = re.sub(
                    r"<system-reminder>.*?</system-reminder>",
                    "",
                    body,
                    flags=re.DOTALL,
                ).strip()
            # Skip local-command noise
            if body.startswith("<local-command"):
                continue
            if body:
                rendered.append(body)
        elif kind == "tool_use":
            use_id = entry.get("id", "")
            use_name = entry.get("name", "unknown")
            tool_id_to_name[use_id] = use_name
            line = format_tool_use(use_name, entry.get("input", {}))
            if line:
                rendered.append(line)
        elif kind == "tool_result":
            result_name = tool_id_to_name.get(entry.get("tool_use_id", ""))
            line = format_tool_result(
                result_name,
                entry.get("content", ""),
                entry.get("is_error", False),
            )
            if line:
                rendered.append(line)
    return rendered
def process_jsonl(
    jsonl_path: Path,
    byte_offset: int = 0,
) -> tuple[list[str], dict[str, Any]]:
    """Process a JSONL session file and return transcript lines + metadata.

    Reads the file in binary mode so byte offsets stay exact across runs;
    only "user"/"assistant" records contribute to the transcript. Command
    invocations and large injected skill prompts are collapsed into short
    one-line markers instead of being transcribed verbatim.

    Args:
        jsonl_path: Path to the JSONL file
        byte_offset: Start reading from this byte position (for incremental)
    Returns:
        Tuple of (transcript_lines, metadata_dict); metadata includes
        first/last dates, message counts, git branch (first seen), and the
        byte offset to resume from next time.
    """
    transcript_lines: list[str] = []
    metadata: dict[str, Any] = {
        "first_date": None,
        "last_date": None,
        "message_count": 0,
        "human_messages": 0,
        "assistant_messages": 0,
        "git_branch": None,
        "new_byte_offset": 0,
    }
    # Map tool_use IDs to tool names for correlating results
    tool_id_to_name: dict[str, str] = {}
    # Track when a command/skill was just invoked so the next user message
    # (the skill prompt injection) gets labeled correctly
    last_command_name: str | None = None
    # Binary mode: offsets from f.tell() must be byte-accurate for resuming.
    with open(jsonl_path, "rb") as f:
        if byte_offset > 0:
            f.seek(byte_offset)
        for raw_line in f:
            try:
                obj = json.loads(raw_line)
            except json.JSONDecodeError:
                # Tolerate partial/corrupt lines (e.g. a mid-write tail).
                continue
            record_type = obj.get("type")
            # Skip non-message types
            if record_type not in ("user", "assistant"):
                continue
            msg = obj.get("message", {})
            role = msg.get("role", record_type)
            content = msg.get("content", "")
            # Track metadata: first_date is set once, last_date every record.
            date = extract_session_date(obj)
            if metadata["first_date"] is None:
                metadata["first_date"] = date
            metadata["last_date"] = date
            metadata["message_count"] += 1
            if not metadata["git_branch"]:
                metadata["git_branch"] = obj.get("gitBranch")
            if role == "user":
                metadata["human_messages"] += 1
            elif role == "assistant":
                metadata["assistant_messages"] += 1
            # Process content — plain-string form first, block-list below.
            if isinstance(content, str):
                text = content.strip()
                # Skip system-reminder and local-command noise
                if "<system-reminder>" in text:
                    text = re.sub(
                        r"<system-reminder>.*?</system-reminder>",
                        "",
                        text,
                        flags=re.DOTALL,
                    ).strip()
                if text.startswith("<local-command"):
                    continue
                if text.startswith("<command-name>/exit"):
                    continue
                # Detect command/skill invocation: <command-name>/foo</command-name>
                cmd_match = re.search(
                    r"<command-name>/([^<]+)</command-name>", text,
                )
                if cmd_match:
                    last_command_name = cmd_match.group(1)
                    # Keep just a brief note about the command invocation
                    transcript_lines.append(
                        f"**Human**: /{last_command_name}"
                    )
                    transcript_lines.append("")
                    continue
                # Detect skill prompt injection (large structured text after a command)
                if (
                    last_command_name
                    and role == "user"
                    and len(text) > 500
                ):
                    # This is the skill's injected prompt — summarize it
                    transcript_lines.append(
                        f"[Skill prompt: /{last_command_name}{len(text)} chars]"
                    )
                    transcript_lines.append("")
                    last_command_name = None
                    continue
                # Also detect skill prompts by content pattern (catches cases
                # where the command-name message wasn't separate, or where the
                # prompt arrives without a preceding command-name tag)
                if (
                    role == "user"
                    and len(text) > 500
                    and re.match(
                        r"^##\s*(Tracking|Step|Context|Instructions|Overview|Goal)",
                        text,
                    )
                ):
                    # Structured skill prompt — try to extract command name
                    cmd_in_text = re.search(
                        r'--command\s+"([^"]+)"', text,
                    )
                    prompt_label = cmd_in_text.group(1) if cmd_in_text else (last_command_name or "unknown")
                    transcript_lines.append(
                        f"[Skill prompt: /{prompt_label}{len(text)} chars]"
                    )
                    transcript_lines.append("")
                    last_command_name = None
                    continue
                last_command_name = None  # Reset after non-matching message
                if text:
                    label = "**Human**" if role == "user" else "**Assistant**"
                    transcript_lines.append(f"{label}: {text}")
                    transcript_lines.append("")
            elif isinstance(content, list):
                # Check if this is a skill prompt in list form
                is_skill_prompt = False
                skill_prompt_name = last_command_name
                if role == "user":
                    for block in content:
                        if block.get("type") == "text":
                            block_text = block.get("text", "").strip()
                            # Detect by preceding command name
                            if last_command_name and len(block_text) > 500:
                                is_skill_prompt = True
                                break
                            # Detect by content pattern (## Tracking, etc.)
                            if (
                                len(block_text) > 500
                                and re.match(
                                    r"^##\s*(Tracking|Step|Context|Instructions|Overview|Goal)",
                                    block_text,
                                )
                            ):
                                is_skill_prompt = True
                                # Try to extract command name from content
                                cmd_in_text = re.search(
                                    r'--command\s+"([^"]+)"', block_text,
                                )
                                if cmd_in_text:
                                    skill_prompt_name = cmd_in_text.group(1)
                                break
                if is_skill_prompt:
                    total_len = sum(
                        len(b.get("text", ""))
                        for b in content
                        if b.get("type") == "text"
                    )
                    label = skill_prompt_name or "unknown"
                    transcript_lines.append(
                        f"[Skill prompt: /{label}{total_len} chars]"
                    )
                    transcript_lines.append("")
                    last_command_name = None
                    continue
                last_command_name = None
                parts = parse_content_blocks(content, role, tool_id_to_name)
                if parts:
                    # Determine if this is a tool result message (user role but
                    # contains only tool_result blocks — these are tool outputs,
                    # not human input)
                    has_only_tool_results = all(
                        b.get("type") in ("tool_result",)
                        for b in content
                        if b.get("type") != "text" or b.get("text", "").strip()
                    ) and any(b.get("type") == "tool_result" for b in content)
                    if has_only_tool_results:
                        # Tool results — no speaker label, just the formatted output
                        for part in parts:
                            transcript_lines.append(part)
                    elif role == "user":
                        # Check if there's actual human text (not just tool results)
                        has_human_text = any(
                            b.get("type") == "text"
                            and b.get("text", "").strip()
                            and "<system-reminder>" not in b.get("text", "")
                            for b in content
                        )
                        label = "**Human**" if has_human_text else "**Assistant**"
                        if len(parts) == 1:
                            transcript_lines.append(f"{label}: {parts[0]}")
                        else:
                            transcript_lines.append(f"{label}:")
                            for part in parts:
                                transcript_lines.append(part)
                    else:
                        label = "**Assistant**"
                        if len(parts) == 1:
                            transcript_lines.append(f"{label}: {parts[0]}")
                        else:
                            transcript_lines.append(f"{label}:")
                            for part in parts:
                                transcript_lines.append(part)
                    # Blank separator line after each rendered message.
                    transcript_lines.append("")
        # Remember where we stopped so the next run can resume from here.
        metadata["new_byte_offset"] = f.tell()
    return transcript_lines, metadata
# ---------------------------------------------------------------------------
# Markdown generation
# ---------------------------------------------------------------------------
def build_frontmatter(
    session_id: str,
    project: str,
    date: str,
    message_count: int,
    git_branch: str | None = None,
) -> str:
    """Build YAML frontmatter for a conversation markdown file.

    The git_branch field is emitted only when a truthy branch name is
    supplied.
    """
    fields: list[tuple[str, Any]] = [
        ("title", f"Session {session_id[:8]}"),
        ("type", "conversation"),
        ("project", project),
        ("date", date),
        ("session_id", session_id),
        ("messages", message_count),
        ("status", "extracted"),
    ]
    if git_branch:
        fields.append(("git_branch", git_branch))
    body = "\n".join(f"{key}: {value}" for key, value in fields)
    return f"---\n{body}\n---"
def write_new_conversation(
    output_path: Path,
    session_id: str,
    project: str,
    transcript_lines: list[str],
    metadata: dict[str, Any],
) -> None:
    """Write a new conversation markdown file.

    Creates parent directories as needed. The date comes from the first
    extracted record, falling back to today's UTC date.
    """
    session_date = metadata["first_date"] or datetime.now(timezone.utc).strftime("%Y-%m-%d")
    header = build_frontmatter(
        session_id=session_id,
        project=project,
        date=session_date,
        message_count=metadata["message_count"],
        git_branch=metadata.get("git_branch"),
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    body = "\n".join(transcript_lines)
    with open(output_path, "w") as f:
        f.write(f"{header}\n\n## Transcript\n\n{body}\n")
def append_to_conversation(
    output_path: Path,
    transcript_lines: list[str],
    new_message_count: int,
) -> None:
    """Append new transcript content to an existing conversation file.

    Rewrites the frontmatter message count, stamps (or refreshes)
    last_updated with today's UTC date, then appends the new transcript
    lines.
    """
    doc = output_path.read_text()
    # Refresh the message count in frontmatter
    doc = re.sub(
        r"^messages: \d+$",
        f"messages: {new_message_count}",
        doc,
        count=1,
        flags=re.MULTILINE,
    )
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if "last_updated:" in doc:
        # Refresh the existing stamp in place.
        doc = re.sub(
            r"^last_updated: .+$",
            f"last_updated: {today}",
            doc,
            count=1,
            flags=re.MULTILINE,
        )
    else:
        # Insert the stamp just before the status line.
        doc = doc.replace(
            "\nstatus: extracted",
            f"\nlast_updated: {today}\nstatus: extracted",
        )
    # Rebuild the file: updated header/body, then the new transcript.
    pieces = [doc]
    if not doc.endswith("\n"):
        pieces.append("\n")
    pieces.append("\n".join(transcript_lines))
    pieces.append("\n")
    with open(output_path, "w") as f:
        f.write("".join(pieces))
# ---------------------------------------------------------------------------
# Main extraction logic
# ---------------------------------------------------------------------------
def extract_session(
    session_info: dict[str, Any],
    state: dict[str, Any],
    dry_run: bool = False,
) -> bool:
    """Extract a single session. Returns True if work was done.

    Resumes from the byte offset saved in `state` so long-running sessions
    are extracted incrementally; mutates `state` in place with the new
    offset, counts, and output file path.
    """
    session_id = session_info["session_id"]
    project = session_info["project"]
    jsonl_path = session_info["jsonl_path"]
    file_size = session_info["file_size"]
    # Check state for prior extraction
    session_state = state["sessions"].get(session_id, {})
    last_offset = session_state.get("byte_offset", 0)
    # Skip if no new content has been appended since the last run
    if file_size <= last_offset:
        return False
    is_incremental = last_offset > 0
    if dry_run:
        mode = "append" if is_incremental else "new"
        new_bytes = file_size - last_offset
        # " — " separator restored: count previously ran into the session id.
        print(f" [{mode}] {project}/{session_id[:8]} — {new_bytes:,} new bytes")
        return True
    # Parse the JSONL from the saved offset
    transcript_lines, metadata = process_jsonl(jsonl_path, byte_offset=last_offset)
    if not transcript_lines:
        # Update offset even if no extractable content, so the same bytes
        # are not re-scanned on the next run.
        state["sessions"][session_id] = {
            "project": project,
            "byte_offset": metadata["new_byte_offset"],
            "message_count": session_state.get("message_count", 0),
            "last_extracted": datetime.now(timezone.utc).isoformat(),
            "summarized_through_msg": session_state.get("summarized_through_msg", 0),
        }
        return False
    # Determine output path
    date = metadata["first_date"] or datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if is_incremental:
        # Use existing output file
        output_file = session_state.get("output_file", "")
        output_path = WIKI_DIR / output_file if output_file else None
    else:
        output_path = None
    if output_path is None or not output_path.exists():
        filename = f"{date}-{session_id[:8]}.md"
        output_path = CONVERSATIONS_DIR / project / filename
    # Write or append
    total_messages = session_state.get("message_count", 0) + metadata["message_count"]
    if is_incremental and output_path.exists():
        append_to_conversation(output_path, transcript_lines, total_messages)
        print(f" [append] {project}/{output_path.name} — +{metadata['message_count']} messages")
    else:
        write_new_conversation(output_path, session_id, project, transcript_lines, metadata)
        # " — " separator restored for consistency with the append branch.
        print(f" [new] {project}/{output_path.name} — {metadata['message_count']} messages")
    # Update state
    state["sessions"][session_id] = {
        "project": project,
        "output_file": str(output_path.relative_to(WIKI_DIR)),
        "byte_offset": metadata["new_byte_offset"],
        "message_count": total_messages,
        "last_extracted": datetime.now(timezone.utc).isoformat(),
        "summarized_through_msg": session_state.get("summarized_through_msg", 0),
    }
    return True
def main() -> None:
    """CLI entry point: parse arguments, discover sessions, run extraction."""
    parser = argparse.ArgumentParser(
        description="Extract Claude Code sessions into markdown transcripts",
    )
    parser.add_argument(
        "--project",
        help="Only extract sessions for this project code (e.g., mc, if, lp)",
    )
    parser.add_argument(
        "--session",
        help="Only extract this specific session (prefix match on session ID)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be extracted without writing files",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-extract from the beginning, ignoring saved byte offsets",
    )
    args = parser.parse_args()

    state = load_state()
    if args.force:
        # Zero every saved offset so each session re-parses from byte 0.
        for entry in state["sessions"].values():
            entry["byte_offset"] = 0

    # Discover sessions matching the filters
    sessions = discover_sessions(
        project_filter=args.project,
        session_filter=args.session,
    )
    if not sessions:
        print("No sessions found matching filters.")
        return
    print(f"Found {len(sessions)} session(s) to check...")
    if args.dry_run:
        print("DRY RUN — no files will be written\n")

    extracted = sum(
        1
        for session_info in sessions
        if extract_session(session_info, state, dry_run=args.dry_run)
    )
    if extracted:
        print(f"\nExtracted {extracted} session(s).")
    else:
        print("No new content to extract.")
    if not args.dry_run:
        save_state(state)


if __name__ == "__main__":
    main()