Add wiki-distill.py as Phase 1a of the maintenance pipeline. This is
the 8th extension memex adds to Karpathy's pattern and the one that
makes the MemPalace integration a real ingest pipeline instead of
just a searchable archive beside the wiki.
## The gap distill closes
The mining layer was extracting Claude Code sessions, classifying
bullets into halls (fact/discovery/preference/advice/event/tooling),
and tagging topics. The URL harvester scanned conversations for cited
links. Hygiene refreshed last_verified on wiki pages referenced in
related: fields. But none of those steps compiled the knowledge
*inside* the conversations themselves into wiki pages. Decisions,
root causes, and patterns stayed in the summaries forever — findable
via qmd but never synthesized into canonical pages.
## What distill does
Narrow today-filter with historical rollup:
1. Find all summarized conversations dated TODAY
2. Extract their topics: fields — this is the "topics of today" set
3. For each topic in that set, pull ALL summarized conversations
across history that share that topic (full historical context)
4. Extract hall_facts + hall_discoveries + hall_advice bullets
   (the high-signal hall types; event/preference/tooling are skipped)
5. Send topic group + wiki index.md to claude -p
6. Model emits JSON actions[]: new_page / update_page / skip
7. Write each action to staging/<type>/ with distill provenance
frontmatter (staged_by: wiki-distill, distill_topic,
distill_source_conversations, compilation_notes)
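
A staged page produced by distill might carry frontmatter like the
following. The key names are the ones listed above; the title, paths,
and values are invented for illustration:

```yaml
---
title: Zoho API rate-limit pattern
type: pattern
status: staged
staged_by: wiki-distill
distill_topic: zoho-api
distill_source_conversations:
  - conversations/2026-01-12-zoho-oauth.md
  - conversations/2026-02-03-zoho-bulk-read.md
compilation_notes: rolled up 2 conversations (34 bullets)
---
```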
First-run bootstrap: uses 7-day lookback instead of today-only so
the state file gets seeded reasonably. After that, daily runs stay
narrow.
Self-triggering: dormant topics that resurface in a new conversation
automatically pull in all historical conversations on that topic via
the rollup. Old knowledge gets distilled when it becomes relevant
again without manual intervention.
## Orchestration — distill BEFORE harvest
wiki-maintain.sh now has Phase 1a (distill) + Phase 1b (harvest):
1a. wiki-distill.py — conversations → staging (PRIORITY)
1b. wiki-harvest.py — URLs → raw/harvested → staging (supplement)
2. wiki-hygiene.py — decay, archive, repair, checks
3. qmd reindex
Conversation content drives the page shape; URL harvesting fills
gaps for external references conversations don't cover. New flags:
--distill-only, --no-distill, --distill-first-run.
## Verified on real wiki
Tested end-to-end on the production wiki with 611 summarized
conversations across 14 wings. First-run dry-run found 116 topic
groups worth distilling (+ 3 too-thin). Tested single-topic compile
with --topic zoho-api: the LLM rolled up 2 conversations (34
bullets), synthesized a proper pattern page with "What / Why /
Known Limitations" structure, linked it to existing wiki pages,
and landed it in staging with full distill provenance. The LLM
correctly rejected claude-code-statusline (already well covered
by an existing live page), so the "skip" path works.
## Code additions
- scripts/wiki-distill.py (new, ~530 lines)
- scripts/wiki_lib.py: HIGH_SIGNAL_HALLS + parse_conversation_halls
+ high_signal_halls + _flatten_bullet helpers
- scripts/wiki-maintain.sh: Phase 1a distill, new flags
- tests/test_wiki_distill.py (21 new tests — hall parsing, rollup,
state management, CLI smoke tests)
- tests/test_shell_scripts.py: updated phase-name assertion for
the Phase 1a/1b split
## Docs additions
- README.md: 8th row in extensions table, updated compounding-loop
diagram, new wiki-distill.py reference in architecture overview
- docs/DESIGN-RATIONALE.md: new section 8 "Closing the MemPalace
loop" with full mempalace taxonomy mapping
- docs/ARCHITECTURE.md: wiki-distill.py section, updated phase
order, updated state file table, updated dep graph
- docs/SETUP.md: updated cron comment, first-run distill guidance,
verify section test count
- .gitignore: note distill-state.json is committed (sync across
machines), not gitignored
- docs/artifacts/signal-and-noise.html: new "Distill ⬣" top-level
tab with flow diagram, hall filter table, narrow-today/wide-
history explanation, staging provenance example
## Tests
192 tests total (+21 new, +1 regression fix), all green in ~1.5s.
## scripts/wiki_lib.py (272 lines, 8.1 KiB, Python)
```python
"""Shared helpers for wiki maintenance scripts.

Provides frontmatter parsing/serialization, WikiPage dataclass, and common
constants used by wiki-hygiene.py, wiki-staging.py, and wiki-harvest.py.
"""

from __future__ import annotations

import hashlib
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any

# Wiki root — override via WIKI_DIR env var for tests / alternate installs
WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki")))
INDEX_FILE = WIKI_DIR / "index.md"
STAGING_DIR = WIKI_DIR / "staging"
STAGING_INDEX = STAGING_DIR / "index.md"
ARCHIVE_DIR = WIKI_DIR / "archive"
ARCHIVE_INDEX = ARCHIVE_DIR / "index.md"
REPORTS_DIR = WIKI_DIR / "reports"
CONVERSATIONS_DIR = WIKI_DIR / "conversations"
HARVEST_STATE_FILE = WIKI_DIR / ".harvest-state.json"

LIVE_CONTENT_DIRS = ["patterns", "decisions", "concepts", "environments"]

FM_FENCE = "---\n"


@dataclass
class WikiPage:
    path: Path
    frontmatter: dict[str, Any]
    fm_raw: str
    body: str
    fm_start: int


def today() -> date:
    return datetime.now(timezone.utc).date()


def parse_date(value: Any) -> date | None:
    if not value:
        return None
    if isinstance(value, date):
        return value
    s = str(value).strip()
    try:
        return datetime.strptime(s, "%Y-%m-%d").date()
    except ValueError:
        return None


def parse_page(path: Path) -> WikiPage | None:
    """Parse a markdown page with YAML frontmatter. Returns None if no frontmatter."""
    try:
        text = path.read_text()
    except OSError:
        return None
    if not text.startswith(FM_FENCE):
        return None
    end = text.find("\n---\n", 4)
    if end == -1:
        return None
    fm_raw = text[4:end]
    body = text[end + 5 :]
    fm = parse_yaml_lite(fm_raw)
    return WikiPage(path=path, frontmatter=fm, fm_raw=fm_raw, body=body, fm_start=end + 5)


def parse_yaml_lite(text: str) -> dict[str, Any]:
    """Parse a subset of YAML used in wiki frontmatter.

    Supports:
    - key: value
    - key: [a, b, c]
    - key:
        - a
        - b
    """
    result: dict[str, Any] = {}
    lines = text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line.strip() or line.lstrip().startswith("#"):
            i += 1
            continue
        m = re.match(r"^([\w_-]+):\s*(.*)$", line)
        if not m:
            i += 1
            continue
        key, rest = m.group(1), m.group(2).strip()
        if rest == "":
            items: list[str] = []
            j = i + 1
            while j < len(lines) and re.match(r"^\s+-\s+", lines[j]):
                items.append(re.sub(r"^\s+-\s+", "", lines[j]).strip())
                j += 1
            if items:
                result[key] = items
                i = j
                continue
            result[key] = ""
            i += 1
            continue
        if rest.startswith("[") and rest.endswith("]"):
            inner = rest[1:-1].strip()
            if inner:
                result[key] = [x.strip().strip('"').strip("'") for x in inner.split(",")]
            else:
                result[key] = []
            i += 1
            continue
        result[key] = rest.strip('"').strip("'")
        i += 1
    return result


# Canonical frontmatter key order for serialization
PREFERRED_KEY_ORDER = [
    "title", "type", "confidence",
    "status", "origin",
    "last_compiled", "last_verified",
    "staged_date", "staged_by", "target_path", "modifies", "compilation_notes",
    "archived_date", "archived_reason", "original_path",
    "sources", "related",
]


def serialize_frontmatter(fm: dict[str, Any]) -> str:
    """Serialize a frontmatter dict back to YAML in the wiki's canonical style."""
    out_lines: list[str] = []
    seen: set[str] = set()
    for key in PREFERRED_KEY_ORDER:
        if key in fm:
            out_lines.append(_format_fm_entry(key, fm[key]))
            seen.add(key)
    for key in sorted(fm.keys()):
        if key in seen:
            continue
        out_lines.append(_format_fm_entry(key, fm[key]))
    return "\n".join(out_lines)


def _format_fm_entry(key: str, value: Any) -> str:
    if isinstance(value, list):
        if not value:
            return f"{key}: []"
        lines = [f"{key}:"]
        for item in value:
            lines.append(f"  - {item}")
        return "\n".join(lines)
    return f"{key}: {value}"


def write_page(page: WikiPage, new_fm: dict[str, Any] | None = None, new_body: str | None = None) -> None:
    fm = new_fm if new_fm is not None else page.frontmatter
    body = new_body if new_body is not None else page.body
    fm_yaml = serialize_frontmatter(fm)
    text = f"---\n{fm_yaml}\n---\n{body}"
    page.path.write_text(text)


def iter_live_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    for sub in LIVE_CONTENT_DIRS:
        for md in sorted((WIKI_DIR / sub).glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def iter_staging_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    if not STAGING_DIR.exists():
        return pages
    for sub in LIVE_CONTENT_DIRS:
        d = STAGING_DIR / sub
        if not d.exists():
            continue
        for md in sorted(d.glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def iter_archived_pages() -> list[WikiPage]:
    pages: list[WikiPage] = []
    if not ARCHIVE_DIR.exists():
        return pages
    for sub in LIVE_CONTENT_DIRS:
        d = ARCHIVE_DIR / sub
        if not d.exists():
            continue
        for md in sorted(d.glob("*.md")):
            page = parse_page(md)
            if page:
                pages.append(page)
    return pages


def page_content_hash(page: WikiPage) -> str:
    """Hash of page body only (excludes frontmatter) so mechanical frontmatter fixes don't churn the hash."""
    return "sha256:" + hashlib.sha256(page.body.strip().encode("utf-8")).hexdigest()


# ---------------------------------------------------------------------------
# Conversation hall parsing
# ---------------------------------------------------------------------------
#
# Summarized conversations have sections in the body like:
#   ## Decisions (hall: fact)
#   - bullet
#   - bullet
#   ## Discoveries (hall: discovery)
#   - bullet
#
# Hall types used by the summarizer: fact, discovery, preference, advice,
# event, tooling. Only fact/discovery/advice are high-signal enough to
# distill into wiki pages; the others are tracked but not auto-promoted.

HIGH_SIGNAL_HALLS = {"fact", "discovery", "advice"}

_HALL_SECTION_RE = re.compile(
    r"^##\s+[^\n]*?\(hall:\s*(\w+)\s*\)\s*$(.*?)(?=^##\s|\Z)",
    re.MULTILINE | re.DOTALL,
)
_BULLET_RE = re.compile(r"^\s*-\s+(.*?)$", re.MULTILINE)


def parse_conversation_halls(page: WikiPage) -> dict[str, list[str]]:
    """Extract hall-bucketed bullet content from a summarized conversation body.

    Returns a dict like:
      {"fact": ["claim one", "claim two"],
       "discovery": ["root cause X"],
       "advice": ["do Y", "consider Z"], ...}

    Empty hall types are omitted. Bullet lines are stripped of leading "- "
    and trailing whitespace; multi-line bullets are joined with a space.
    """
    result: dict[str, list[str]] = {}
    for match in _HALL_SECTION_RE.finditer(page.body):
        hall_type = match.group(1).strip().lower()
        section_body = match.group(2)
        bullets = [
            _flatten_bullet(b.group(1))
            for b in _BULLET_RE.finditer(section_body)
        ]
        bullets = [b for b in bullets if b]
        if bullets:
            result.setdefault(hall_type, []).extend(bullets)
    return result


def _flatten_bullet(text: str) -> str:
    """Collapse a possibly-multiline bullet into a single clean line."""
    return " ".join(text.split()).strip()


def high_signal_halls(page: WikiPage) -> dict[str, list[str]]:
    """Return only fact/discovery/advice content from a conversation."""
    all_halls = parse_conversation_halls(page)
    return {k: v for k, v in all_halls.items() if k in HIGH_SIGNAL_HALLS}
```