Initial commit — memex

A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
commit ee54a2f5d4
Author: Eric Turner
Date: 2026-04-12 21:16:02 -06:00

31 changed files with 10792 additions and 0 deletions

scripts/wiki_lib.py (new file, 211 lines)

@@ -0,0 +1,211 @@
"""Shared helpers for wiki maintenance scripts.
Provides frontmatter parsing/serialization, WikiPage dataclass, and common
constants used by wiki-hygiene.py, wiki-staging.py, and wiki-harvest.py.
"""
from __future__ import annotations
import hashlib
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any
# Wiki root — override via WIKI_DIR env var for tests / alternate installs
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
INDEX_FILE = WIKI_DIR / "index.md"
STAGING_DIR = WIKI_DIR / "staging"
STAGING_INDEX = STAGING_DIR / "index.md"
ARCHIVE_DIR = WIKI_DIR / "archive"
ARCHIVE_INDEX = ARCHIVE_DIR / "index.md"
REPORTS_DIR = WIKI_DIR / "reports"
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
HARVEST_STATE_FILE = WIKI_DIR / ".harvest-state.json"
LIVE_CONTENT_DIRS = ["patterns", "decisions", "concepts", "environments"]
FM_FENCE = "---\n"


@dataclass
class WikiPage:
    path: Path
    frontmatter: dict[str, Any]
    fm_raw: str
    body: str
    fm_start: int


def today() -> date:
    return datetime.now(timezone.utc).date()


def parse_date(value: Any) -> date | None:
    if not value:
        return None
    if isinstance(value, date):
        return value
    s = str(value).strip()
    try:
        return datetime.strptime(s, "%Y-%m-%d").date()
    except ValueError:
        return None


def parse_page(path: Path) -> WikiPage | None:
    """Parse a markdown page with YAML frontmatter. Returns None if no frontmatter."""
    try:
        text = path.read_text()
    except OSError:
        return None
    if not text.startswith(FM_FENCE):
        return None
    end = text.find("\n---\n", 4)
    if end == -1:
        return None
    fm_raw = text[4:end]
    body = text[end + 5 :]
    fm = parse_yaml_lite(fm_raw)
    return WikiPage(path=path, frontmatter=fm, fm_raw=fm_raw, body=body, fm_start=end + 5)
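
# Illustrative example, not part of the original file: for a page reading
#   ---
#   title: Retry backoff
#   status: live
#   ---
#   Use exponential backoff with jitter.
# parse_page returns a WikiPage with frontmatter
# {"title": "Retry backoff", "status": "live"} and body
# "Use exponential backoff with jitter.\n" (both fences are dropped).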


def parse_yaml_lite(text: str) -> dict[str, Any]:
    """Parse a subset of YAML used in wiki frontmatter.

    Supports:
    - key: value
    - key: [a, b, c]
    - key:
      - a
      - b
    """
    result: dict[str, Any] = {}
    lines = text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line.strip() or line.lstrip().startswith("#"):
            i += 1
            continue
        m = re.match(r"^([\w_-]+):\s*(.*)$", line)
        if not m:
            i += 1
            continue
        key, rest = m.group(1), m.group(2).strip()
        if rest == "":
            items: list[str] = []
            j = i + 1
            while j < len(lines) and re.match(r"^\s+-\s+", lines[j]):
                items.append(re.sub(r"^\s+-\s+", "", lines[j]).strip())
                j += 1
            if items:
                result[key] = items
                i = j
                continue
            result[key] = ""
            i += 1
            continue
        if rest.startswith("[") and rest.endswith("]"):
            inner = rest[1:-1].strip()
            if inner:
                result[key] = [x.strip().strip('"').strip("'") for x in inner.split(",")]
            else:
                result[key] = []
            i += 1
            continue
        result[key] = rest.strip('"').strip("'")
        i += 1
    return result
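
# A hypothetical example (added for illustration): the three supported forms
# parse as
#   confidence: high              -> {"confidence": "high"}
#   related: [backoff, retries]   -> {"related": ["backoff", "retries"]}
#   sources:
#     - conv-2026-04-01           -> {"sources": ["conv-2026-04-01"]}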


# Canonical frontmatter key order for serialization
PREFERRED_KEY_ORDER = [
    "title", "type", "confidence",
    "status", "origin",
    "last_compiled", "last_verified",
    "staged_date", "staged_by", "target_path", "modifies", "compilation_notes",
    "archived_date", "archived_reason", "original_path",
    "sources", "related",
]


def serialize_frontmatter(fm: dict[str, Any]) -> str:
    """Serialize a frontmatter dict back to YAML in the wiki's canonical style."""
    out_lines: list[str] = []
    seen: set[str] = set()
    for key in PREFERRED_KEY_ORDER:
        if key in fm:
            out_lines.append(_format_fm_entry(key, fm[key]))
            seen.add(key)
    for key in sorted(fm.keys()):
        if key in seen:
            continue
        out_lines.append(_format_fm_entry(key, fm[key]))
    return "\n".join(out_lines)


def _format_fm_entry(key: str, value: Any) -> str:
    if isinstance(value, list):
        if not value:
            return f"{key}: []"
        lines = [f"{key}:"]
        for item in value:
            lines.append(f" - {item}")
        return "\n".join(lines)
    return f"{key}: {value}"


def write_page(page: WikiPage, new_fm: dict[str, Any] | None = None, new_body: str | None = None) -> None:
    fm = new_fm if new_fm is not None else page.frontmatter
    body = new_body if new_body is not None else page.body
    fm_yaml = serialize_frontmatter(fm)
    text = f"---\n{fm_yaml}\n---\n{body}"
    page.path.write_text(text)


def iter_live_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    for sub in LIVE_CONTENT_DIRS:
        for md in sorted((WIKI_DIR / sub).glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def iter_staging_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    if not STAGING_DIR.exists():
        return pages
    for sub in LIVE_CONTENT_DIRS:
        d = STAGING_DIR / sub
        if not d.exists():
            continue
        for md in sorted(d.glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def iter_archived_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    if not ARCHIVE_DIR.exists():
        return pages
    for sub in LIVE_CONTENT_DIRS:
        d = ARCHIVE_DIR / sub
        if not d.exists():
            continue
        for md in sorted(d.glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def page_content_hash(page: WikiPage) -> str:
    """Hash of page body only (excludes frontmatter) so mechanical frontmatter fixes don't churn the hash."""
    return "sha256:" + hashlib.sha256(page.body.strip().encode("utf-8")).hexdigest()