A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes: - 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library) - Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE - Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup - 171-test pytest suite (cross-platform, runs in ~1.3s) - MIT licensed
477 lines · 16 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""Update conversation index and context files from summarized conversations.
|
|
|
|
Phase C of the conversation mining pipeline. Reads all conversation markdown
|
|
files and regenerates:
|
|
- conversations/index.md — catalog organized by project
|
|
- context/wake-up.md — world briefing from recent conversations
|
|
- context/active-concerns.md — current blockers and open threads
|
|
|
|
Usage:
|
|
python3 update-conversation-index.py
|
|
python3 update-conversation-index.py --reindex # Also triggers qmd update
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Wiki root — override with the WIKI_DIR environment variable.
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
# Per-project conversation markdown lives under conversations/<code>/.
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
# Auto-generated context briefings live under context/.
CONTEXT_DIR = WIKI_DIR / "context"
# Files regenerated from scratch on every run.
INDEX_FILE = CONVERSATIONS_DIR / "index.md"
WAKEUP_FILE = CONTEXT_DIR / "wake-up.md"
CONCERNS_FILE = CONTEXT_DIR / "active-concerns.md"
|
|
# ════════════════════════════════════════════════════════════════════════════
# CONFIGURE ME — Project code to display name mapping
# ════════════════════════════════════════════════════════════════════════════
#
# Every project code you use in `extract-sessions.py`'s PROJECT_MAP should
# have a display name here. The conversation index groups conversations by
# these codes and renders them under sections named by the display name.
#
# Examples — replace with your own:
PROJECT_NAMES: dict[str, str] = {
    "wiki": "WIKI — This Wiki",
    "cl": "CL — Claude Config",
    # "web": "WEB — My Webapp",
    # "mob": "MOB — My Mobile App",
    # "work": "WORK — Day Job",
    "general": "General — Cross-Project",
}

# Order for display — put your most-active projects first.
# NOTE: sections render strictly in this order, and codes missing from this
# list never appear in the generated index — keep it in sync with
# PROJECT_NAMES above.
PROJECT_ORDER = [
    # "work", "web", "mob",
    "wiki", "cl", "general",
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Frontmatter parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse simple YAML-style frontmatter from a markdown file.

    Only flat ``key: value`` pairs are understood — nested YAML, lists,
    and quoting are returned as raw strings.

    Args:
        file_path: Markdown file expected to start with a ``---`` block.

    Returns:
        Mapping of frontmatter keys to stripped string values; empty
        dict when no frontmatter block is present.
    """
    fm: dict[str, str] = {}
    # Explicit UTF-8: bare read_text() uses the locale encoding, which
    # breaks on Windows for non-ASCII titles/topics.
    content = file_path.read_text(encoding="utf-8")

    # Frontmatter is the text between the opening --- markers.
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return fm

    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()

    return fm
|
|
|
|
|
|
def get_summary_line(file_path: Path) -> str:
    """Extract the first sentence of the ``## Summary`` section.

    Returns a sentence capped at 120 characters (ellipsized when longer),
    or ``"No summary available."`` when the section is missing.
    """
    content = file_path.read_text(encoding="utf-8")
    # \Z lets a Summary section that ends the file (no trailing blank
    # line or next heading) still match; the original regex missed it.
    match = re.search(r"## Summary\n\n(.+?)(?:\n\n|\n##|\Z)", content, re.DOTALL)
    if not match:
        return "No summary available."

    summary = match.group(1).strip()
    # Naive ". "-based sentence split — good enough for generated prose.
    first_sentence = summary.split(". ")[0]
    if not first_sentence.endswith("."):
        first_sentence += "."
    # Keep index lines readable.
    if len(first_sentence) > 120:
        first_sentence = first_sentence[:117] + "..."
    return first_sentence
|
|
|
|
|
|
def get_decisions(file_path: Path) -> list[str]:
    """Extract bullet items from the ``## Decisions`` section.

    Returns the text of each ``- `` bullet (marker stripped); empty
    list when the section is absent or has no bullets.
    """
    content = file_path.read_text(encoding="utf-8")
    decisions: list[str] = []
    # Section body runs until the next heading, a --- rule, or EOF.
    match = re.search(r"## Decisions.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                decisions.append(line[2:])
    return decisions
|
|
|
|
|
|
def get_discoveries(file_path: Path) -> list[str]:
    """Extract bullet items from the ``## Discoveries`` section.

    Returns the text of each ``- `` bullet (marker stripped); empty
    list when the section is absent or has no bullets.
    """
    content = file_path.read_text(encoding="utf-8")
    discoveries: list[str] = []
    # Section body runs until the next heading, a --- rule, or EOF.
    match = re.search(r"## Discoveries.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                discoveries.append(line[2:])
    return discoveries
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Conversation discovery
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def discover_conversations() -> dict[str, list[dict[str, Any]]]:
    """Discover all conversation files organized by project.

    Scans each subdirectory of CONVERSATIONS_DIR whose name is a known
    project code, reads the frontmatter of every ``*.md`` file, and
    returns ``{project_code: [entry, ...]}`` with entries in reverse
    filename order (newest first, since filenames are date-prefixed —
    TODO confirm against extract-sessions.py naming).
    """
    by_project: dict[str, list[dict[str, Any]]] = defaultdict(list)

    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue

        project_code = project_dir.name
        # Directories without a configured display name are ignored.
        if project_code not in PROJECT_NAMES:
            continue

        # NOTE: the original also skipped ".gitkeep" here, but
        # glob("*.md") can never yield it — dead check removed.
        for md_file in sorted(project_dir.glob("*.md"), reverse=True):
            fm = parse_frontmatter(md_file)
            by_project[project_code].append({
                "file": md_file,
                "relative": md_file.relative_to(CONVERSATIONS_DIR),
                "title": fm.get("title", md_file.stem),
                "date": fm.get("date", "unknown"),
                "status": fm.get("status", "extracted"),
                "messages": fm.get("messages", "0"),
                "halls": fm.get("halls", ""),
                "topics": fm.get("topics", ""),
                "project": project_code,
            })

    return by_project
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Index generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def generate_index(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate the conversations/index.md content.

    Emits a frontmattered markdown catalog: overall status counts, then
    one section per project (in PROJECT_ORDER) listing its non-trivial
    conversations — summarized ones get a one-line summary, extracted
    ones a "pending" tag, trivial ones only a count.
    """
    total = sum(len(convos) for convos in by_project.values())
    summarized = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "summarized"
    )
    trivial = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "trivial"
    )
    extracted = total - summarized - trivial

    lines = [
        "---",
        "title: Conversation Index",
        "type: index",
        f"last_updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}",
        "---",
        "",
        "# Conversation Index",
        "",
        # Plain string (the original was an f-string with no placeholders).
        "Mined conversations from Claude Code sessions, organized by project (wing).",
        "",
        f"**{total} conversations** — {summarized} summarized, {extracted} pending, {trivial} trivial.",
        "",
        "---",
        "",
    ]

    for project_code in PROJECT_ORDER:
        convos = by_project.get(project_code, [])
        display_name = PROJECT_NAMES.get(project_code, project_code.upper())

        lines.append(f"## {display_name}")
        lines.append("")

        if not convos:
            lines.append("_No conversations mined yet._")
            lines.append("")
            continue

        # List summarized and extracted entries in discovery order;
        # trivial sessions are only counted, never listed.
        shown = 0
        for c in convos:
            if c["status"] == "trivial":
                continue

            status_tag = " _(pending summary)_" if c["status"] == "extracted" else ""

            summary_text = ""
            if c["status"] == "summarized":
                summary_text = f" — {get_summary_line(c['file'])}"

            # as_posix(): markdown links need forward slashes even when
            # this runs on Windows.
            lines.append(
                f"- [{c['title']}]({c['relative'].as_posix()})"
                f" ({c['date']}, {c['messages']} msgs)"
                f"{summary_text}{status_tag}"
            )
            shown += 1

        trivial_count = len(convos) - shown
        if trivial_count > 0:
            lines.append(f"\n_{trivial_count} trivial session(s) not listed._")

        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Context generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _assess_activity(convos: list[dict[str, Any]], today: str) -> dict[str, Any]:
    """Classify one project's activity level from its conversations.

    Returns ``{"status", "last_date", "count"}`` where status is Active
    (session in the last 7 days), Quiet (30 days), Inactive, Unknown
    (unparseable date), or "—" (no conversations at all).
    """
    summarized = [c for c in convos if c["status"] == "summarized"]

    if summarized:
        latest = max(summarized, key=lambda c: c["date"])
        last_date = latest["date"]
        try:
            dt = datetime.strptime(last_date, "%Y-%m-%d")
            days_ago = (datetime.now() - dt).days
            if days_ago <= 7:
                status = "Active"
            elif days_ago <= 30:
                status = "Quiet"
            else:
                status = "Inactive"
        except ValueError:
            status = "Unknown"
            last_date = "—"
    elif convos:
        # Only extracted (unsummarized) sessions exist: call the project
        # active when the newest one falls within the current month.
        latest = max(convos, key=lambda c: c["date"])
        last_date = latest["date"]
        status = "Active" if latest["date"] >= today[:7] else "Quiet"
    else:
        status = "—"
        last_date = "—"

    return {"status": status, "last_date": last_date, "count": len(convos)}


def _recent_items(
    by_project: dict[str, list[dict[str, Any]]],
    extractor,
    limit: int,
) -> list[tuple[str, str, str]]:
    """Collect (date, project, text) tuples from summarized conversations.

    ``extractor`` is a callable like get_decisions/get_discoveries that
    maps a file path to a list of strings. Result is sorted newest-first
    and truncated to ``limit`` items.
    """
    items: list[tuple[str, str, str]] = []
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for text in extractor(c["file"]):
                items.append((c["date"], code, text))
    items.sort(key=lambda x: x[0], reverse=True)
    return items[:limit]


def generate_wakeup(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/wake-up.md from recent conversations.

    Produces a per-project activity roster table plus the most recent
    decisions and discoveries harvested from summarized conversations.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Activity level for every configured project, including "general".
    project_activity = {
        code: _assess_activity(by_project.get(code, []), today)
        for code in PROJECT_ORDER
    }

    recent_decisions = _recent_items(by_project, get_decisions, 10)
    recent_discoveries = _recent_items(by_project, get_discoveries, 5)

    lines = [
        "---",
        "title: Wake-Up Briefing",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Wake-Up Briefing",
        "",
        "Auto-generated world state for AI session context.",
        "",
        "## Active Projects",
        "",
        "| Code | Project | Status | Last Activity | Sessions |",
        "|------|---------|--------|---------------|----------|",
    ]

    for code in PROJECT_ORDER:
        if code == "general":
            continue  # Cross-project bucket is not a roster entry
        info = project_activity.get(code, {"status": "—", "last_date": "—", "count": 0})
        name = PROJECT_NAMES.get(code, "")
        # Display only the human half of "CODE — Name"; fall back to the code.
        display = name.split(" — ")[1] if " — " in name else code
        lines.append(
            f"| {code.upper()} | {display} | {info['status']} | {info['last_date']} | {info['count']} |"
        )

    lines.append("")

    if recent_decisions:
        lines.append("## Recent Decisions")
        lines.append("")
        for date, proj, decision in recent_decisions[:7]:
            lines.append(f"- **[{proj.upper()}]** {decision} ({date})")
        lines.append("")

    if recent_discoveries:
        lines.append("## Recent Discoveries")
        lines.append("")
        for date, proj, disc in recent_discoveries[:5]:
            lines.append(f"- **[{proj.upper()}]** {disc} ({date})")
        lines.append("")

    if not recent_decisions and not recent_discoveries:
        lines.append("## Recent Decisions")
        lines.append("")
        lines.append("_Populated after summarization runs._")
        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def generate_concerns(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/active-concerns.md from recent conversations.

    Currently a mostly-template document: the focus-area section is
    derived from this month's session counts, while Blockers and Open
    Questions stay placeholders until summary parsing is implemented.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    month_prefix = today[:7]

    header = [
        "---",
        "title: Active Concerns",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Active Concerns",
        "",
        "Auto-generated from recent conversations. Current blockers, deadlines, and open questions.",
        "",
    ]

    # Count this month's sessions per project, keeping configured order
    # so the later stable sort breaks ties by PROJECT_ORDER.
    monthly = [
        (code, sum(1 for c in by_project.get(code, []) if c["date"] >= month_prefix))
        for code in PROJECT_ORDER
    ]
    hot = [(code, n) for code, n in monthly if n]

    body: list[str] = []
    if hot:
        hot.sort(key=lambda item: item[1], reverse=True)
        body.append("## Current Focus Areas")
        body.append("")
        for code, n in hot[:5]:
            body.append(f"- **{PROJECT_NAMES.get(code, code)}** — {n} session(s) this month")
        body.append("")

    body += [
        "## Blockers",
        "",
        "_Populated from conversation analysis._",
        "",
        "## Open Questions",
        "",
        "_Populated from conversation analysis._",
        "",
    ]

    return "\n".join(header + body)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: regenerate index and context files, optionally reindex qmd."""
    parser = argparse.ArgumentParser(
        description="Update conversation index and context files",
    )
    parser.add_argument(
        "--reindex",
        action="store_true",
        help="Also trigger qmd update and embed after updating files",
    )
    args = parser.parse_args()

    # Discover all conversations grouped by project code.
    by_project = discover_conversations()
    total = sum(len(v) for v in by_project.values())
    print(f"Found {total} conversation(s) across {len(by_project)} projects.")

    # Generate and write the index. Explicit UTF-8 on writes: the content
    # contains em-dashes etc. that the Windows locale codec may reject.
    index_content = generate_index(by_project)
    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    INDEX_FILE.write_text(index_content, encoding="utf-8")
    print(f"Updated {INDEX_FILE.relative_to(WIKI_DIR)}")

    # Generate and write context files (create dir if needed).
    WAKEUP_FILE.parent.mkdir(parents=True, exist_ok=True)
    WAKEUP_FILE.write_text(generate_wakeup(by_project), encoding="utf-8")
    print(f"Updated {WAKEUP_FILE.relative_to(WIKI_DIR)}")

    CONCERNS_FILE.write_text(generate_concerns(by_project), encoding="utf-8")
    print(f"Updated {CONCERNS_FILE.relative_to(WIKI_DIR)}")

    # Optionally trigger qmd reindex (best-effort; failures don't abort).
    if args.reindex:
        print("Triggering qmd reindex...")
        try:
            subprocess.run(["qmd", "update"], check=True, capture_output=True)
            subprocess.run(["qmd", "embed"], check=True, capture_output=True)
            print("qmd index updated.")
        except FileNotFoundError:
            print("qmd not found — skipping reindex.", file=sys.stderr)
        except subprocess.CalledProcessError as e:
            # capture_output=True swallowed qmd's own stderr; surface it
            # so the failure is actually diagnosable.
            detail = e.stderr.decode(errors="replace").strip() if e.stderr else ""
            message = f"qmd reindex failed: {e}"
            if detail:
                message += f"\n{detail}"
            print(message, file=sys.stderr)


if __name__ == "__main__":
    main()
|