"""Shared helpers for wiki maintenance scripts. Provides frontmatter parsing/serialization, WikiPage dataclass, and common constants used by wiki-hygiene.py, wiki-staging.py, and wiki-harvest.py. """ from __future__ import annotations import hashlib import os import re from dataclasses import dataclass from datetime import date, datetime, timezone from pathlib import Path from typing import Any # Wiki root — override via WIKI_DIR env var for tests / alternate installs WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki"))) INDEX_FILE = WIKI_DIR / "index.md" STAGING_DIR = WIKI_DIR / "staging" STAGING_INDEX = STAGING_DIR / "index.md" ARCHIVE_DIR = WIKI_DIR / "archive" ARCHIVE_INDEX = ARCHIVE_DIR / "index.md" REPORTS_DIR = WIKI_DIR / "reports" CONVERSATIONS_DIR = WIKI_DIR / "conversations" HARVEST_STATE_FILE = WIKI_DIR / ".harvest-state.json" LIVE_CONTENT_DIRS = ["patterns", "decisions", "concepts", "environments"] FM_FENCE = "---\n" @dataclass class WikiPage: path: Path frontmatter: dict[str, Any] fm_raw: str body: str fm_start: int def today() -> date: return datetime.now(timezone.utc).date() def parse_date(value: Any) -> date | None: if not value: return None if isinstance(value, date): return value s = str(value).strip() try: return datetime.strptime(s, "%Y-%m-%d").date() except ValueError: return None def parse_page(path: Path) -> WikiPage | None: """Parse a markdown page with YAML frontmatter. Returns None if no frontmatter.""" try: text = path.read_text() except OSError: return None if not text.startswith(FM_FENCE): return None end = text.find("\n---\n", 4) if end == -1: return None fm_raw = text[4:end] body = text[end + 5 :] fm = parse_yaml_lite(fm_raw) return WikiPage(path=path, frontmatter=fm, fm_raw=fm_raw, body=body, fm_start=end + 5) def parse_yaml_lite(text: str) -> dict[str, Any]: """Parse a subset of YAML used in wiki frontmatter. Supports: - key: value - key: [a, b, c] - key: - a - b """ result: dict[str, Any] = {} lines = text.splitlines() i = 0 while i < len(lines): line = lines[i] if not line.strip() or line.lstrip().startswith("#"): i += 1 continue m = re.match(r"^([\w_-]+):\s*(.*)$", line) if not m: i += 1 continue key, rest = m.group(1), m.group(2).strip() if rest == "": items: list[str] = [] j = i + 1 while j < len(lines) and re.match(r"^\s+-\s+", lines[j]): items.append(re.sub(r"^\s+-\s+", "", lines[j]).strip()) j += 1 if items: result[key] = items i = j continue result[key] = "" i += 1 continue if rest.startswith("[") and rest.endswith("]"): inner = rest[1:-1].strip() if inner: result[key] = [x.strip().strip('"').strip("'") for x in inner.split(",")] else: result[key] = [] i += 1 continue result[key] = rest.strip('"').strip("'") i += 1 return result # Canonical frontmatter key order for serialization PREFERRED_KEY_ORDER = [ "title", "type", "confidence", "status", "origin", "last_compiled", "last_verified", "staged_date", "staged_by", "target_path", "modifies", "compilation_notes", "archived_date", "archived_reason", "original_path", "sources", "related", ] def serialize_frontmatter(fm: dict[str, Any]) -> str: """Serialize a frontmatter dict back to YAML in the wiki's canonical style.""" out_lines: list[str] = [] seen: set[str] = set() for key in PREFERRED_KEY_ORDER: if key in fm: out_lines.append(_format_fm_entry(key, fm[key])) seen.add(key) for key in sorted(fm.keys()): if key in seen: continue out_lines.append(_format_fm_entry(key, fm[key])) return "\n".join(out_lines) def _format_fm_entry(key: str, value: Any) -> str: if isinstance(value, list): if not value: return f"{key}: []" lines = [f"{key}:"] for item in value: lines.append(f" - {item}") return "\n".join(lines) return f"{key}: {value}" def write_page(page: WikiPage, new_fm: dict[str, Any] | None = None, new_body: str | None = None) -> None: fm = new_fm if new_fm is not None else page.frontmatter body = new_body if new_body is not None else page.body fm_yaml = serialize_frontmatter(fm) text = f"---\n{fm_yaml}\n---\n{body}" page.path.write_text(text) def iter_live_pages() -> list[WikiPage]: pages: list[WikiPage] = [] for sub in LIVE_CONTENT_DIRS: for md in sorted((WIKI_DIR / sub).glob("*.md")): page = parse_page(md) if page: pages.append(page) return pages def iter_staging_pages() -> list[WikiPage]: pages: list[WikiPage] = [] if not STAGING_DIR.exists(): return pages for sub in LIVE_CONTENT_DIRS: d = STAGING_DIR / sub if not d.exists(): continue for md in sorted(d.glob("*.md")): page = parse_page(md) if page: pages.append(page) return pages def iter_archived_pages() -> list[WikiPage]: pages: list[WikiPage] = [] if not ARCHIVE_DIR.exists(): return pages for sub in LIVE_CONTENT_DIRS: d = ARCHIVE_DIR / sub if not d.exists(): continue for md in sorted(d.glob("*.md")): page = parse_page(md) if page: pages.append(page) return pages def page_content_hash(page: WikiPage) -> str: """Hash of page body only (excludes frontmatter) so mechanical frontmatter fixes don't churn the hash.""" return "sha256:" + hashlib.sha256(page.body.strip().encode("utf-8")).hexdigest()