Initial commit — memex
A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes: - 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library) - Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE - Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup - 171-test pytest suite (cross-platform, runs in ~1.3s) - MIT licensed
This commit is contained in:
211
scripts/wiki_lib.py
Normal file
211
scripts/wiki_lib.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Shared helpers for wiki maintenance scripts.
|
||||
|
||||
Provides frontmatter parsing/serialization, WikiPage dataclass, and common
|
||||
constants used by wiki-hygiene.py, wiki-staging.py, and wiki-harvest.py.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Wiki root — override via WIKI_DIR env var for tests / alternate installs
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
INDEX_FILE = WIKI_DIR / "index.md"  # master index of live pages
STAGING_DIR = WIKI_DIR / "staging"  # human-in-the-loop staging area (wiki-staging.py — per module docstring)
STAGING_INDEX = STAGING_DIR / "index.md"
ARCHIVE_DIR = WIKI_DIR / "archive"  # retired pages moved out of the live tree
ARCHIVE_INDEX = ARCHIVE_DIR / "index.md"
REPORTS_DIR = WIKI_DIR / "reports"
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
HARVEST_STATE_FILE = WIKI_DIR / ".harvest-state.json"  # presumably wiki-harvest.py bookkeeping — confirm against that script

# Subdirectories that hold live (non-staged, non-archived) content pages;
# the iter_*_pages helpers below scan these names under each root.
LIVE_CONTENT_DIRS = ["patterns", "decisions", "concepts", "environments"]

# Frontmatter fence as it appears at the very start of a page
FM_FENCE = "---\n"
|
||||
|
||||
|
||||
@dataclass
class WikiPage:
    """A wiki markdown page split into frontmatter and body (built by parse_page)."""

    path: Path  # location of the .md file on disk
    frontmatter: dict[str, Any]  # parsed frontmatter key/value mapping
    fm_raw: str  # raw YAML text between the opening and closing --- fences
    body: str  # markdown content after the closing fence
    fm_start: int  # character offset into the file text where body begins
|
||||
|
||||
|
||||
def today() -> date:
    """Return the current UTC date (avoids local-timezone drift around midnight)."""
    return datetime.now(tz=timezone.utc).date()
|
||||
|
||||
|
||||
def parse_date(value: Any) -> date | None:
    """Coerce *value* to a ``date``.

    Accepts ``date``/``datetime`` instances or ``YYYY-MM-DD`` strings.
    Returns None for empty/falsy values or unparseable strings.
    """
    if not value:
        return None
    # datetime is a subclass of date, so it must be checked first — otherwise a
    # datetime slips through the date branch and callers get a datetime back.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    s = str(value).strip()
    try:
        return datetime.strptime(s, "%Y-%m-%d").date()
    except ValueError:
        return None
|
||||
|
||||
|
||||
def parse_page(path: Path) -> WikiPage | None:
    """Parse a markdown page with YAML frontmatter.

    Returns None when the file is unreadable, is not valid UTF-8, or has no
    complete frontmatter block (opening and closing ``---`` fences).
    """
    try:
        # Explicit UTF-8: the default encoding is locale-dependent (e.g. cp1252
        # on Windows), and a decode failure is not an OSError — handle both.
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
    if not text.startswith(FM_FENCE):
        return None
    # Closing fence: first "\n---\n" after the opening "---\n" (offset 4).
    end = text.find("\n---\n", 4)
    if end == -1:
        return None
    fm_raw = text[4:end]
    body = text[end + 5 :]  # skip past the "\n---\n" closing fence
    fm = parse_yaml_lite(fm_raw)
    return WikiPage(path=path, frontmatter=fm, fm_raw=fm_raw, body=body, fm_start=end + 5)
|
||||
|
||||
|
||||
def parse_yaml_lite(text: str) -> dict[str, Any]:
    """Parse the restricted YAML dialect used in wiki frontmatter.

    Supported forms:
    - key: value            (scalar; surrounding quotes stripped)
    - key: [a, b, c]        (inline list)
    - key:                  (block list on the following indented lines)
      - a
      - b

    Unrecognized lines and comments are skipped; a bare key with no list
    items maps to the empty string.
    """
    key_re = re.compile(r"^([\w_-]+):\s*(.*)$")
    item_re = re.compile(r"^\s+-\s+")
    parsed: dict[str, Any] = {}
    lines = text.splitlines()
    total = len(lines)
    idx = 0
    while idx < total:
        current = lines[idx]
        # Skip blank lines and comment lines outright.
        if not current.strip() or current.lstrip().startswith("#"):
            idx += 1
            continue
        match = key_re.match(current)
        if match is None:
            idx += 1
            continue
        key = match.group(1)
        value = match.group(2).strip()
        if not value:
            # Bare key — gather any indented "- item" block that follows.
            block_items: list[str] = []
            nxt = idx + 1
            while nxt < total and item_re.match(lines[nxt]):
                block_items.append(item_re.sub("", lines[nxt]).strip())
                nxt += 1
            if block_items:
                parsed[key] = block_items
                idx = nxt
            else:
                parsed[key] = ""
                idx += 1
        elif value.startswith("[") and value.endswith("]"):
            inner = value[1:-1].strip()
            if inner:
                parsed[key] = [piece.strip().strip('"').strip("'") for piece in inner.split(",")]
            else:
                parsed[key] = []
            idx += 1
        else:
            parsed[key] = value.strip('"').strip("'")
            idx += 1
    return parsed
|
||||
|
||||
|
||||
# Canonical frontmatter key order for serialization
|
||||
PREFERRED_KEY_ORDER = [
|
||||
"title", "type", "confidence",
|
||||
"status", "origin",
|
||||
"last_compiled", "last_verified",
|
||||
"staged_date", "staged_by", "target_path", "modifies", "compilation_notes",
|
||||
"archived_date", "archived_reason", "original_path",
|
||||
"sources", "related",
|
||||
]
|
||||
|
||||
|
||||
def serialize_frontmatter(fm: dict[str, Any]) -> str:
|
||||
"""Serialize a frontmatter dict back to YAML in the wiki's canonical style."""
|
||||
out_lines: list[str] = []
|
||||
seen: set[str] = set()
|
||||
for key in PREFERRED_KEY_ORDER:
|
||||
if key in fm:
|
||||
out_lines.append(_format_fm_entry(key, fm[key]))
|
||||
seen.add(key)
|
||||
for key in sorted(fm.keys()):
|
||||
if key in seen:
|
||||
continue
|
||||
out_lines.append(_format_fm_entry(key, fm[key]))
|
||||
return "\n".join(out_lines)
|
||||
|
||||
|
||||
def _format_fm_entry(key: str, value: Any) -> str:
|
||||
if isinstance(value, list):
|
||||
if not value:
|
||||
return f"{key}: []"
|
||||
lines = [f"{key}:"]
|
||||
for item in value:
|
||||
lines.append(f" - {item}")
|
||||
return "\n".join(lines)
|
||||
return f"{key}: {value}"
|
||||
|
||||
|
||||
def write_page(page: WikiPage, new_fm: dict[str, Any] | None = None, new_body: str | None = None) -> None:
    """Rewrite *page* on disk, optionally substituting frontmatter and/or body.

    Frontmatter is re-serialized in canonical key order; the file is written
    as ``---\\n<yaml>\\n---\\n<body>``.
    """
    fm = new_fm if new_fm is not None else page.frontmatter
    body = new_body if new_body is not None else page.body
    fm_yaml = serialize_frontmatter(fm)
    text = f"---\n{fm_yaml}\n---\n{body}"
    # Explicit UTF-8 to match parse_page and avoid locale-dependent encodings
    # (e.g. cp1252 on Windows mangling non-ASCII content).
    page.path.write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
def _collect_pages(root: Path) -> list[WikiPage]:
    """Parse every *.md page under root's LIVE_CONTENT_DIRS subdirectories.

    Missing subdirectories are skipped; files that parse_page cannot parse
    (no frontmatter, unreadable) are silently ignored. Results are ordered
    by subdirectory (LIVE_CONTENT_DIRS order), then by filename.
    """
    pages: list[WikiPage] = []
    for sub in LIVE_CONTENT_DIRS:
        d = root / sub
        if not d.exists():
            continue
        for md in sorted(d.glob("*.md")):
            page = parse_page(md)
            if page is not None:
                pages.append(page)
    return pages


def iter_live_pages() -> list[WikiPage]:
    """All parseable pages in the live content directories under WIKI_DIR."""
    return _collect_pages(WIKI_DIR)


def iter_staging_pages() -> list[WikiPage]:
    """All parseable pages awaiting review under STAGING_DIR."""
    return _collect_pages(STAGING_DIR)


def iter_archived_pages() -> list[WikiPage]:
    """All parseable pages retired under ARCHIVE_DIR."""
    return _collect_pages(ARCHIVE_DIR)
|
||||
|
||||
|
||||
def page_content_hash(page: WikiPage) -> str:
    """Return a sha256 digest of the page body only.

    Frontmatter is deliberately excluded (and the body whitespace-stripped)
    so that mechanical frontmatter fixes don't churn the hash.
    """
    digest = hashlib.sha256(page.body.strip().encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"
|
||||
Reference in New Issue
Block a user