Initial commit — memex

A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
This commit is contained in:
Eric Turner
2026-04-12 21:16:02 -06:00
commit ee54a2f5d4
31 changed files with 10792 additions and 0 deletions

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""Update conversation index and context files from summarized conversations.
Phase C of the conversation mining pipeline. Reads all conversation markdown
files and regenerates:
- conversations/index.md — catalog organized by project
- context/wake-up.md — world briefing from recent conversations
- context/active-concerns.md — current blockers and open threads
Usage:
python3 update-conversation-index.py
python3 update-conversation-index.py --reindex # Also triggers qmd update
"""
from __future__ import annotations
import argparse
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root of the wiki checkout; override via the WIKI_DIR environment variable.
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
# Mined conversation markdown lives under conversations/<project-code>/.
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
# Auto-generated context briefings are written here.
CONTEXT_DIR = WIKI_DIR / "context"
# Files regenerated from scratch on every run.
INDEX_FILE = CONVERSATIONS_DIR / "index.md"
WAKEUP_FILE = CONTEXT_DIR / "wake-up.md"
CONCERNS_FILE = CONTEXT_DIR / "active-concerns.md"
# ════════════════════════════════════════════════════════════════════════════
# CONFIGURE ME — Project code to display name mapping
# ════════════════════════════════════════════════════════════════════════════
#
# Every project code you use in `extract-sessions.py`'s PROJECT_MAP should
# have a display name here. The conversation index groups conversations by
# these codes and renders them under sections named by the display name.
#
# NOTE(review): values appear to follow a "CODE — Name" convention (em dash
# separator); downstream display logic relies on that shape — keep it when
# customizing.
#
# Examples — replace with your own:
PROJECT_NAMES: dict[str, str] = {
"wiki": "WIKI — This Wiki",
"cl": "CL — Claude Config",
# "web": "WEB — My Webapp",
# "mob": "MOB — My Mobile App",
# "work": "WORK — Day Job",
"general": "General — Cross-Project",
}
# Order for display — put your most-active projects first
PROJECT_ORDER = [
# "work", "web", "mob",
"wiki", "cl", "general",
]
# ---------------------------------------------------------------------------
# Frontmatter parsing
# ---------------------------------------------------------------------------
def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse flat `key: value` YAML frontmatter from a markdown file.

    Returns an empty dict when the file has no leading ``---`` block.
    Only the first colon on each line splits, so values containing colons
    survive intact. Nested YAML structures are not supported.
    """
    fm: dict[str, str] = {}
    # Read as UTF-8 explicitly so parsing doesn't depend on the platform's
    # locale encoding (the pipeline writes these files as UTF-8 markdown).
    content = file_path.read_text(encoding="utf-8")
    # Frontmatter is delimited by --- markers at the very top of the file.
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return fm
    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()
    return fm
def get_summary_line(file_path: Path) -> str:
    """Return the first sentence of the file's ``## Summary`` section.

    The sentence always ends with a period and is capped at 120 characters
    (ellipsis-truncated). Returns a placeholder string when no Summary
    section is found.
    """
    content = file_path.read_text(encoding="utf-8")
    # \Z lets a Summary section that ends the file (no trailing blank line
    # or following heading) still match, instead of falling through to the
    # "No summary available." placeholder.
    match = re.search(r"## Summary\n\n(.+?)(?:\n\n|\n##|\Z)", content, re.DOTALL)
    if not match:
        return "No summary available."
    summary = match.group(1).strip()
    # First sentence (naive split on ". "; abbreviations may over-split).
    first_sentence = summary.split(". ")[0]
    if not first_sentence.endswith("."):
        first_sentence += "."
    # Truncate if too long
    if len(first_sentence) > 120:
        first_sentence = first_sentence[:117] + "..."
    return first_sentence
def _get_section_bullets(file_path: Path, heading: str) -> list[str]:
    """Return the ``- `` bullet items under ``## <heading>`` in a markdown file.

    The section runs until the next heading, a horizontal rule, or EOF.
    Returns an empty list when the heading is absent.
    """
    content = file_path.read_text(encoding="utf-8")
    items: list[str] = []
    # `.*?` after the heading tolerates suffixes like "## Decisions Made".
    match = re.search(rf"## {heading}.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL)
    if match:
        for line in match.group(1).strip().splitlines():
            line = line.strip()
            if line.startswith("- "):
                items.append(line[2:])
    return items


def get_decisions(file_path: Path) -> list[str]:
    """Extract decision bullet points from a conversation file."""
    return _get_section_bullets(file_path, "Decisions")


def get_discoveries(file_path: Path) -> list[str]:
    """Extract discovery bullet points from a conversation file."""
    return _get_section_bullets(file_path, "Discoveries")
# ---------------------------------------------------------------------------
# Conversation discovery
# ---------------------------------------------------------------------------
def discover_conversations() -> dict[str, list[dict[str, Any]]]:
    """Discover all conversation markdown files, grouped by project code.

    Scans each immediate subdirectory of CONVERSATIONS_DIR whose name is a
    registered project code, listing its ``*.md`` files newest-name-first.
    Returns a mapping of project code to a list of metadata dicts
    (file, relative, title, date, status, messages, halls, topics, project).
    """
    by_project: dict[str, list[dict[str, Any]]] = defaultdict(list)
    # First run may happen before anything has been mined; don't crash on
    # a missing conversations directory.
    if not CONVERSATIONS_DIR.is_dir():
        return by_project
    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        project_code = project_dir.name
        # Skip directories that aren't registered project codes.
        if project_code not in PROJECT_NAMES:
            continue
        # reverse=True puts newest (date-prefixed) filenames first.
        # (A `.gitkeep` check is unnecessary: the *.md glob can't match it.)
        for md_file in sorted(project_dir.glob("*.md"), reverse=True):
            fm = parse_frontmatter(md_file)
            by_project[project_code].append({
                "file": md_file,
                "relative": md_file.relative_to(CONVERSATIONS_DIR),
                "title": fm.get("title", md_file.stem),
                "date": fm.get("date", "unknown"),
                # Files start life as "extracted"; summarization flips it.
                "status": fm.get("status", "extracted"),
                "messages": fm.get("messages", "0"),
                "halls": fm.get("halls", ""),
                "topics": fm.get("topics", ""),
                "project": project_code,
            })
    return by_project
# ---------------------------------------------------------------------------
# Index generation
# ---------------------------------------------------------------------------
def generate_index(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate the conversations/index.md content.

    Produces a catalog grouped by project (in PROJECT_ORDER) with summary
    counts in the header. Trivial sessions are tallied but not listed.
    """
    total = sum(len(convos) for convos in by_project.values())
    summarized = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "summarized"
    )
    trivial = sum(
        1
        for convos in by_project.values()
        for c in convos
        if c["status"] == "trivial"
    )
    extracted = total - summarized - trivial
    lines = [
        "---",
        "title: Conversation Index",
        "type: index",
        f"last_updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}",
        "---",
        "",
        "# Conversation Index",
        "",
        "Mined conversations from Claude Code sessions, organized by project (wing).",
        "",
        f"**{total} conversations** — {summarized} summarized, {extracted} pending, {trivial} trivial.",
        "",
        "---",
        "",
    ]
    for project_code in PROJECT_ORDER:
        convos = by_project.get(project_code, [])
        display_name = PROJECT_NAMES.get(project_code, project_code.upper())
        lines.append(f"## {display_name}")
        lines.append("")
        if not convos:
            lines.append("_No conversations mined yet._")
            lines.append("")
            continue
        # Show summarized first, then extracted; skip trivial from listing.
        shown = 0
        for c in convos:
            if c["status"] == "trivial":
                continue
            status_tag = " _(pending summary)_" if c["status"] == "extracted" else ""
            # BUG FIX: the summary used to be concatenated directly after the
            # closing parenthesis with no separator; prefix an em dash so the
            # line reads "- [title](path) (date, N msgs) — Summary."
            summary_text = ""
            if c["status"] == "summarized":
                summary_text = f" — {get_summary_line(c['file'])}"
            lines.append(
                f"- [{c['title']}]({c['relative']})"
                f" ({c['date']}, {c['messages']} msgs)"
                f"{summary_text}{status_tag}"
            )
            shown += 1
        trivial_count = len(convos) - shown
        if trivial_count > 0:
            # Leading \n yields a blank line that keeps the note out of
            # the bullet list once everything is joined.
            lines.append(f"\n_{trivial_count} trivial session(s) not listed._")
        lines.append("")
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Context generation
# ---------------------------------------------------------------------------
def generate_wakeup(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/wake-up.md from recent conversations.

    Builds a project roster table (activity status derived from the most
    recent session date) plus the most recent decisions and discoveries
    pulled from summarized conversation files.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # Determine activity level per project.
    project_activity: dict[str, dict[str, Any]] = {}
    for code in PROJECT_ORDER:
        convos = by_project.get(code, [])
        summarized = [c for c in convos if c["status"] == "summarized"]
        if summarized:
            latest = max(summarized, key=lambda c: c["date"])
            last_date = latest["date"]
            # Simple activity heuristic: sessions in last 7 days = active.
            try:
                dt = datetime.strptime(last_date, "%Y-%m-%d")
                days_ago = (datetime.now() - dt).days
                if days_ago <= 7:
                    status = "Active"
                elif days_ago <= 30:
                    status = "Quiet"
                else:
                    status = "Inactive"
            except ValueError:
                # Frontmatter date wasn't ISO-formatted; can't judge recency.
                status = "Unknown"
                last_date = ""
        elif convos:
            # Extracted-only: "this month" check via YYYY-MM prefix compare.
            latest = max(convos, key=lambda c: c["date"])
            last_date = latest["date"]
            status = "Active" if latest["date"] >= today[:7] else "Quiet"
        else:
            status = ""
            last_date = ""
        project_activity[code] = {
            "status": status,
            "last_date": last_date,
            "count": len(convos),
        }
    # Gather recent decisions across all projects.
    recent_decisions: list[tuple[str, str, str]] = []  # (date, project, decision)
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for decision in get_decisions(c["file"]):
                recent_decisions.append((c["date"], code, decision))
    recent_decisions.sort(key=lambda x: x[0], reverse=True)
    recent_decisions = recent_decisions[:10]  # Top 10 most recent
    # Gather recent discoveries.
    recent_discoveries: list[tuple[str, str, str]] = []
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for disc in get_discoveries(c["file"]):
                recent_discoveries.append((c["date"], code, disc))
    recent_discoveries.sort(key=lambda x: x[0], reverse=True)
    recent_discoveries = recent_discoveries[:5]
    lines = [
        "---",
        "title: Wake-Up Briefing",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Wake-Up Briefing",
        "",
        "Auto-generated world state for AI session context.",
        "",
        "## Active Projects",
        "",
        "| Code | Project | Status | Last Activity | Sessions |",
        "|------|---------|--------|---------------|----------|",
    ]
    for code in PROJECT_ORDER:
        if code == "general":
            continue  # Skip general from roster
        info = project_activity.get(code, {"status": "", "last_date": "", "count": 0})
        # BUG FIX: the original split on an empty separator ("" — apparently a
        # mojibake'd em dash), which always raises ValueError since `"" in s`
        # is True for any string. PROJECT_NAMES values follow the
        # "CODE — Name" convention, so take the part after the em dash.
        full_name = PROJECT_NAMES.get(code, code)
        display = full_name.split("—", 1)[1].strip() if "—" in full_name else code
        lines.append(
            f"| {code.upper()} | {display} | {info['status']} | {info['last_date']} | {info['count']} |"
        )
    lines.append("")
    if recent_decisions:
        lines.append("## Recent Decisions")
        lines.append("")
        for date, proj, decision in recent_decisions[:7]:
            lines.append(f"- **[{proj.upper()}]** {decision} ({date})")
        lines.append("")
    if recent_discoveries:
        lines.append("## Recent Discoveries")
        lines.append("")
        for date, proj, disc in recent_discoveries[:5]:
            lines.append(f"- **[{proj.upper()}]** {disc} ({date})")
        lines.append("")
    if not recent_decisions and not recent_discoveries:
        lines.append("## Recent Decisions")
        lines.append("")
        lines.append("_Populated after summarization runs._")
        lines.append("")
    return "\n".join(lines)
def generate_concerns(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/active-concerns.md from recent conversations.

    Largely a template today; blockers and open questions get filled in
    as summaries accumulate. A "Current Focus Areas" section lists the
    projects with sessions this month, hottest first.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    lines = [
        "---",
        "title: Active Concerns",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Active Concerns",
        "",
        "Auto-generated from recent conversations. Current blockers, deadlines, and open questions.",
        "",
    ]
    # "This month" = date string starts with the current YYYY-MM prefix
    # (lexicographic >= on ISO dates).
    month_prefix = today[:7]
    hot = [
        (code, sum(1 for c in by_project.get(code, []) if c["date"] >= month_prefix))
        for code in PROJECT_ORDER
    ]
    hot = [(code, n) for code, n in hot if n > 0]
    if hot:
        hot.sort(key=lambda pair: pair[1], reverse=True)
        lines.append("## Current Focus Areas")
        lines.append("")
        for code, n in hot[:5]:
            lines.append(f"- **{PROJECT_NAMES.get(code, code)}** — {n} session(s) this month")
        lines.append("")
    lines += [
        "## Blockers",
        "",
        "_Populated from conversation analysis._",
        "",
        "## Open Questions",
        "",
        "_Populated from conversation analysis._",
        "",
    ]
    return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Regenerate the conversation index and context files; optionally reindex."""
    arg_parser = argparse.ArgumentParser(
        description="Update conversation index and context files",
    )
    arg_parser.add_argument(
        "--reindex",
        action="store_true",
        help="Also trigger qmd update and embed after updating files",
    )
    opts = arg_parser.parse_args()

    # Discover every mined conversation, grouped by project code.
    grouped = discover_conversations()
    count = sum(len(v) for v in grouped.values())
    print(f"Found {count} conversation(s) across {len(grouped)} projects.")

    # Regenerate each output file, creating parent directories as needed.
    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    INDEX_FILE.write_text(generate_index(grouped))
    print(f"Updated {INDEX_FILE.relative_to(WIKI_DIR)}")

    WAKEUP_FILE.parent.mkdir(parents=True, exist_ok=True)
    WAKEUP_FILE.write_text(generate_wakeup(grouped))
    print(f"Updated {WAKEUP_FILE.relative_to(WIKI_DIR)}")

    CONCERNS_FILE.write_text(generate_concerns(grouped))
    print(f"Updated {CONCERNS_FILE.relative_to(WIKI_DIR)}")

    # Optionally hand off to qmd so the search index picks up the new files.
    if opts.reindex:
        print("Triggering qmd reindex...")
        try:
            for subcmd in ("update", "embed"):
                subprocess.run(["qmd", subcmd], check=True, capture_output=True)
            print("qmd index updated.")
        except FileNotFoundError:
            # qmd is optional tooling; absence is not an error for this script.
            print("qmd not found — skipping reindex.", file=sys.stderr)
        except subprocess.CalledProcessError as e:
            print(f"qmd reindex failed: {e}", file=sys.stderr)


if __name__ == "__main__":
    main()