From ee54a2f5d4a056b49dd7b2d0465697dd3f0f757d Mon Sep 17 00:00:00 2001 From: Eric Turner Date: Sun, 12 Apr 2026 21:16:02 -0600 Subject: [PATCH] =?UTF-8?q?Initial=20commit=20=E2=80=94=20memex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes: - 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library) - Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE - Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup - 171-test pytest suite (cross-platform, runs in ~1.3s) - MIT licensed --- .gitignore | 35 + LICENSE | 21 + README.md | 421 +++++++ config.example.yaml | 114 ++ docs/ARCHITECTURE.md | 360 ++++++ docs/CUSTOMIZE.md | 432 +++++++ docs/DESIGN-RATIONALE.md | 338 ++++++ docs/SETUP.md | 502 ++++++++ docs/examples/global-CLAUDE.md | 161 +++ docs/examples/wiki-CLAUDE.md | 278 +++++ scripts/extract-sessions.py | 810 +++++++++++++ scripts/mine-conversations.sh | 118 ++ scripts/mine-prompt-v2.md | 40 + scripts/summarize-conversations.py | 646 +++++++++++ scripts/update-conversation-index.py | 476 ++++++++ scripts/wiki-harvest.py | 878 ++++++++++++++ scripts/wiki-hygiene.py | 1587 ++++++++++++++++++++++++++ scripts/wiki-maintain.sh | 198 ++++ scripts/wiki-staging.py | 639 +++++++++++ scripts/wiki-sync.sh | 230 ++++ scripts/wiki_lib.py | 211 ++++ tests/README.md | 107 ++ tests/conftest.py | 300 +++++ tests/pytest.ini | 9 + tests/run.sh | 31 + tests/test_conversation_pipeline.py | 121 ++ tests/test_shell_scripts.py | 209 ++++ tests/test_wiki_harvest.py | 323 ++++++ tests/test_wiki_hygiene.py | 616 ++++++++++ tests/test_wiki_lib.py | 314 +++++ 
tests/test_wiki_staging.py | 267 +++++ 31 files changed, 10792 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 config.example.yaml create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/CUSTOMIZE.md create mode 100644 docs/DESIGN-RATIONALE.md create mode 100644 docs/SETUP.md create mode 100644 docs/examples/global-CLAUDE.md create mode 100644 docs/examples/wiki-CLAUDE.md create mode 100755 scripts/extract-sessions.py create mode 100755 scripts/mine-conversations.sh create mode 100644 scripts/mine-prompt-v2.md create mode 100755 scripts/summarize-conversations.py create mode 100755 scripts/update-conversation-index.py create mode 100755 scripts/wiki-harvest.py create mode 100755 scripts/wiki-hygiene.py create mode 100755 scripts/wiki-maintain.sh create mode 100755 scripts/wiki-staging.py create mode 100755 scripts/wiki-sync.sh create mode 100644 scripts/wiki_lib.py create mode 100644 tests/README.md create mode 100644 tests/conftest.py create mode 100644 tests/pytest.ini create mode 100755 tests/run.sh create mode 100644 tests/test_conversation_pipeline.py create mode 100644 tests/test_shell_scripts.py create mode 100644 tests/test_wiki_harvest.py create mode 100644 tests/test_wiki_hygiene.py create mode 100644 tests/test_wiki_lib.py create mode 100644 tests/test_wiki_staging.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9e48a58 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +# Conversation extraction state — per-machine byte offsets, not portable +.mine-state.json + +# Log files from the mining and maintenance pipelines +scripts/.mine.log +scripts/.maintain.log +scripts/.sync.log +scripts/.summarize-claude.log +scripts/.summarize-claude-retry.log + +# Python bytecode and cache +__pycache__/ +*.py[cod] +*$py.class +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +# Editor / OS noise +.DS_Store +.vscode/ +.idea/ +*.swp +*~ + +# Obsidian workspace state (keep 
the `.obsidian/` config if you use it, +# ignore only the ephemeral bits) +.obsidian/workspace.json +.obsidian/workspace-mobile.json +.obsidian/hotkeys.json + +# NOTE: the following state files are NOT gitignored — they must sync +# across machines so both installs agree on what's been processed: +# .harvest-state.json (URL dedup) +# .hygiene-state.json (content hashes, deferred issues) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6be6959 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Eric Turner + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..dad142e --- /dev/null +++ b/README.md @@ -0,0 +1,421 @@ +# LLM Wiki — Compounding Knowledge for AI Agents + +A persistent, LLM-maintained knowledge base that sits between you and the +sources it was compiled from. Unlike RAG — which re-discovers the same +answers on every query — the wiki **gets richer over time**. 
Facts get +cross-referenced, contradictions get flagged, stale advice ages out and +gets archived, and new knowledge discovered during a session gets written +back so it's there next time. + +The agent reads the wiki at the start of every session and updates it as +new things are learned. The wiki is the long-term memory; the session is +the working memory. + +> **Inspiration**: this combines the ideas from +> [Andrej Karpathy's persistent-wiki gist](https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f) +> and [milla-jovovich/mempalace](https://github.com/milla-jovovich/mempalace), +> and adds an automation layer on top so the wiki maintains itself. + +--- + +## The problem with stateless RAG + +Most people's experience with LLMs and documents looks like RAG: you upload +files, the LLM retrieves chunks at query time, generates an answer, done. +This works — but the LLM is rediscovering knowledge from scratch on every +question. There's no accumulation. + +Ask the same subtle question twice and the LLM does all the same work twice. +Ask something that requires synthesizing five documents and the LLM has to +find and piece together the relevant fragments every time. Nothing is built +up. NotebookLM, ChatGPT file uploads, and most RAG systems work this way. + +Worse, raw sources go stale. URLs rot. Documentation lags. Blog posts +get retracted. If your knowledge base is "the original documents," +stale advice keeps showing up alongside current advice and there's no way +to know which is which. + +## The core idea — a compounding wiki + +Instead of retrieving from raw documents at query time, the LLM +**incrementally builds and maintains a persistent wiki** — a structured, +interlinked collection of markdown files that sits between you and the +raw sources. + +When a new source shows up (a doc page, a blog post, a CLI `--help`, a +conversation transcript), the LLM doesn't just index it. 
It reads it, +extracts what's load-bearing, and integrates it into the existing wiki — +updating topic pages, revising summaries, noting where new data +contradicts old claims, strengthening or challenging the evolving +synthesis. The knowledge is compiled once and then *kept current*, not +re-derived on every query. + +This is the key difference: **the wiki is a persistent, compounding +artifact.** The cross-references are already there. The contradictions have +already been flagged. The synthesis already reflects everything the LLM +has read. The wiki gets richer with every source added and every question +asked. + +You never (or rarely) write the wiki yourself. The LLM writes and maintains +all of it. You're in charge of sourcing, exploration, and asking the right +questions. The LLM does the summarizing, cross-referencing, filing, and +bookkeeping that make a knowledge base actually useful over time. + +--- + +## What this adds beyond Karpathy's gist + +Karpathy's gist describes the *idea* — a wiki the agent maintains. This +repo is a working implementation with an automation layer that handles the +lifecycle of knowledge, not just its creation: + +| Layer | What it does | +|-------|--------------| +| **Conversation mining** | Extracts Claude Code session transcripts into searchable markdown. Summarizes them via `claude -p` with model routing (haiku for short sessions, sonnet for long ones). Links summaries to wiki pages by topic. | +| **URL harvesting** | Scans summarized conversations for external reference URLs. Fetches them via `trafilatura` → `crawl4ai` → stealth mode cascade. Compiles clean markdown into pending wiki pages. | +| **Human-in-the-loop staging** | Automated content lands in `staging/` with `status: pending`. You review via CLI, interactive prompts, or an in-session Claude review. Nothing automated goes live without approval. | +| **Staleness decay** | Every page tracks `last_verified`. 
After 6 months without a refresh signal, confidence decays `high → medium`; 9 months → `low`; 12 months → `stale` → auto-archived. | +| **Auto-restoration** | Archived pages that get referenced again in new conversations or wiki updates are automatically restored. | +| **Hygiene** | Daily structural checks (orphans, broken cross-refs, index drift, frontmatter repair). Weekly LLM-powered checks (duplicates, contradictions, missing cross-references). | +| **Orchestrator** | One script chains all of the above into a daily cron-able pipeline. | + +The result: you don't have to maintain the wiki. You just *use* it. The +automation handles harvesting new knowledge, retiring old knowledge, +keeping cross-references intact, and flagging ambiguity for review. + +--- + +## Why each part exists + +Before implementing anything, the design was worked out interactively +with Claude as a [Signal & Noise analysis of Karpathy's +pattern](https://claude.ai/public/artifacts/0f6e1d9b-3b8c-43df-99d7-3a4328a1620c). +That analysis found seven real weaknesses in the core pattern. This +repo exists because each weakness has a concrete mitigation — and +every component maps directly to one: + +| Karpathy-pattern weakness | How this repo answers it | +|---------------------------|--------------------------| +| **Errors persist and compound** | `confidence` field with time-based decay → pages age out visibly. Staging review catches automated content before it goes live. Full-mode hygiene does LLM contradiction detection. | +| **Hard ~50K-token ceiling** | `qmd` (BM25 + vector + re-ranking) set up from day one. Wing/room structural filtering narrows search before retrieval. Archive collection is excluded from default search. | +| **Manual cross-checking returns** | Every wiki claim traces back to immutable `raw/harvested/*.md` with SHA-256 hash. Staging review IS the cross-check. `compilation_notes` field makes review fast. 
| +| **Knowledge staleness** (the #1 failure mode in community data) | Daily + weekly cron removes "I forgot" as a failure mode. `last_verified` auto-refreshes from conversation references. Decayed pages auto-archive. | +| **Cognitive outsourcing risk** | Staging review forces engagement with every automated page. `qmd query` makes retrieval an active exploration. The wake-up briefing is ~200 tokens, and the human reads it too. | +| **Weaker semantic retrieval** | `qmd` hybrid (BM25 + vector). Full-mode hygiene adds missing cross-references. Structural metadata (wings, rooms) complements semantic search. | +| **No access control** | Git sync with `merge=union` markdown handling. Network-boundary ACL via Tailscale is the suggested path. *This one is a residual trade-off — see [DESIGN-RATIONALE.md](docs/DESIGN-RATIONALE.md).* | + +The short version: Karpathy published the idea, the community found the +holes, and this repo is the automation layer that plugs the holes. +See **[`docs/DESIGN-RATIONALE.md`](docs/DESIGN-RATIONALE.md)** for the +full argument with honest residual trade-offs and what this repo +explicitly does NOT solve. 
+ +--- + +## Compounding loop + +``` +┌─────────────────────┐ +│ Claude Code │ +│ sessions (.jsonl) │ +└──────────┬──────────┘ + │ extract-sessions.py (hourly, no LLM) + ▼ +┌─────────────────────┐ +│ conversations/ │ markdown transcripts +│ /*.md │ (status: extracted) +└──────────┬──────────┘ + │ summarize-conversations.py --claude (daily) + ▼ +┌─────────────────────┐ +│ conversations/ │ summaries with related: wiki links +│ /*.md │ (status: summarized) +└──────────┬──────────┘ + │ wiki-harvest.py (daily) + ▼ +┌─────────────────────┐ +│ raw/harvested/ │ fetched URL content +│ *.md │ (immutable source material) +└──────────┬──────────┘ + │ claude -p compile step + ▼ +┌─────────────────────┐ +│ staging// │ pending pages +│ *.md │ (status: pending, origin: automated) +└──────────┬──────────┘ + │ human review (wiki-staging.py --review) + ▼ +┌─────────────────────┐ +│ patterns/ │ LIVE wiki +│ decisions/ │ (origin: manual or promoted-from-automated) +│ concepts/ │ +│ environments/ │ +└──────────┬──────────┘ + │ wiki-hygiene.py (daily quick / weekly full) + │ - refresh last_verified from new conversations + │ - decay confidence on idle pages + │ - auto-restore archived pages referenced again + │ - fuzzy-fix broken cross-references + ▼ +┌─────────────────────┐ +│ archive// │ stale/superseded content +│ *.md │ (excluded from default search) +└─────────────────────┘ +``` + +Every arrow is automated. The only human step is staging review — and +that's quick because the AI compilation step already wrote the page, you +just approve or reject. + +--- + +## Quick start — two paths + +### Path A: just the idea (Karpathy-style) + +Open a Claude Code session in an empty directory and tell it: + +``` +I want you to start maintaining a persistent knowledge wiki for me. +Create a directory structure with patterns/, decisions/, concepts/, and +environments/ subdirectories. 
Each page should have YAML frontmatter with +title, type, confidence, sources, related, last_compiled, and last_verified +fields. Create an index.md at the root that catalogs every page. + +From now on, when I share a source (a doc page, a CLI --help, a conversation +I had), read it, extract what's load-bearing, and integrate it into the +wiki. Update existing pages when new knowledge refines them. Flag +contradictions between pages. Create new pages when topics aren't +covered yet. Update index.md every time you create or remove a page. + +When I ask a question, read the relevant wiki pages first, then answer. +If you rely on a wiki page with `confidence: low`, flag that to me. +``` + +That's the whole idea. The agent will build you a growing markdown tree +that compounds over time. This is the minimum viable version. + +### Path B: the full automation (this repo) + +```bash +git clone REPO_URL ~/projects/wiki +cd ~/projects/wiki + +# Install the Python extraction tools +pipx install trafilatura +pipx install crawl4ai && crawl4ai-setup + +# Install qmd for full-text + vector search +npm install -g @tobilu/qmd + +# Configure qmd (3 collections — see docs/SETUP.md for the YAML) +# Edit scripts/extract-sessions.py with your project codes +# Edit scripts/update-conversation-index.py with matching display names + +# Copy the example CLAUDE.md files (wiki schema + global instructions) +cp docs/examples/wiki-CLAUDE.md CLAUDE.md +cat docs/examples/global-CLAUDE.md >> ~/.claude/CLAUDE.md +# edit both for your conventions + +# Run the full pipeline once, manually +bash scripts/mine-conversations.sh --extract-only # Fast, no LLM +python3 scripts/summarize-conversations.py --claude # Classify + summarize +python3 scripts/update-conversation-index.py --reindex + +# Then maintain +bash scripts/wiki-maintain.sh # Daily hygiene +bash scripts/wiki-maintain.sh --hygiene-only --full # Weekly deep pass +``` + +See [`docs/SETUP.md`](docs/SETUP.md) for complete setup including qmd 
+configuration (three collections: `wiki`, `wiki-archive`, +`wiki-conversations`), optional cron schedules, git sync, and the +post-merge hook. See [`docs/examples/`](docs/examples/) for starter +`CLAUDE.md` files (wiki schema + global instructions) with explicit +guidance on using the three qmd collections. + +--- + +## Directory layout after setup + +``` +wiki/ +├── CLAUDE.md ← Schema + instructions the agent reads every session +├── index.md ← Content catalog (the agent reads this first) +├── patterns/ ← HOW things should be built (LIVE) +├── decisions/ ← WHY we chose this approach (LIVE) +├── concepts/ ← WHAT the foundational ideas are (LIVE) +├── environments/ ← WHERE implementations differ (LIVE) +├── staging/ ← PENDING automated content awaiting review +│ ├── index.md +│ └── / +├── archive/ ← STALE / superseded (excluded from search) +│ ├── index.md +│ └── / +├── raw/ ← Immutable source material (never modified) +│ ├── / +│ └── harvested/ ← URL harvester output +├── conversations/ ← Mined Claude Code session transcripts +│ ├── index.md +│ └── / +├── context/ ← Auto-updated AI session briefing +│ ├── wake-up.md ← Loaded at the start of every session +│ └── active-concerns.md ← Current blockers and focus areas +├── reports/ ← Hygiene operation logs +├── scripts/ ← The automation pipeline +├── tests/ ← Pytest suite (171 tests) +├── .harvest-state.json ← URL dedup state (committed, synced) +├── .hygiene-state.json ← Content hashes, deferred issues (committed, synced) +└── .mine-state.json ← Conversation extraction offsets (gitignored, per-machine) +``` + +--- + +## What's Claude-specific (and what isn't) + +This repo is built around **Claude Code** as the agent. Specifically: + +1. **Session mining** expects `~/.claude/projects//*.jsonl` + files written by the Claude Code CLI. Other agents won't produce these. +2. **Summarization** uses `claude -p` (the Claude Code CLI's one-shot mode) + with haiku/sonnet routing by conversation length. 
Other LLM CLIs would + need a different wrapper. +3. **URL compilation** uses `claude -p` to turn raw harvested content into + a wiki page with proper frontmatter. +4. **The agent itself** (the thing that reads `CLAUDE.md` and maintains the + wiki conversationally) is Claude Code. Any agent that reads markdown + and can write files could do this job — `CLAUDE.md` is just a text + file telling the agent what the wiki's conventions are. + +**What's NOT Claude-specific**: + +- The wiki schema (frontmatter, directory layout, lifecycle states) +- The staleness decay model and archive/restore semantics +- The human-in-the-loop staging workflow +- The hygiene checks (orphans, broken cross-refs, duplicates) +- The `trafilatura` + `crawl4ai` URL fetching +- The qmd search integration +- The git-based cross-machine sync + +If you use a different agent, you replace parts **1-4** above with +equivalents for your agent. The other 80% of the repo is agent-agnostic. +See [`docs/CUSTOMIZE.md`](docs/CUSTOMIZE.md) for concrete adaptation +recipes. 
+ +--- + +## Architecture at a glance + +Eleven scripts organized in three layers: + +**Mining layer** (ingests conversations): +- `extract-sessions.py` — Parse Claude Code JSONL → markdown transcripts +- `summarize-conversations.py` — Classify + summarize via `claude -p` +- `update-conversation-index.py` — Regenerate conversation index + wake-up context + +**Automation layer** (maintains the wiki): +- `wiki_lib.py` — Shared frontmatter parser, `WikiPage` dataclass, constants +- `wiki-harvest.py` — URL classification + fetch cascade + compile to staging +- `wiki-staging.py` — Human review (list/promote/reject/review/sync) +- `wiki-hygiene.py` — Quick + full hygiene checks, archival, auto-restore +- `wiki-maintain.sh` — Top-level orchestrator chaining harvest + hygiene + +**Sync layer**: +- `wiki-sync.sh` — Git commit/pull/push with merge-union markdown handling +- `mine-conversations.sh` — Mining orchestrator + +See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for a deeper tour. + +--- + +## Why markdown, not a real database? + +Markdown files are: + +- **Human-readable without any tooling** — you can browse in Obsidian, VS Code, or `cat` +- **Git-native** — full history, branching, rollback, cross-machine sync for free +- **Agent-friendly** — every LLM was trained on markdown, so reading and writing it is free +- **Durable** — no schema migrations, no database corruption, no vendor lock-in +- **Interoperable** — Obsidian graph view, `grep`, `qmd`, `ripgrep`, any editor + +A SQLite file with the same content would be faster to query but harder +to browse, harder to merge, harder to audit, and fundamentally less +*collaborative* between you and the agent. Markdown is to knowledge +management what Postgres is to transactions. + +--- + +## Testing + +Full pytest suite in `tests/` — 171 tests across all scripts, runs in +**~1.3 seconds**, no network or LLM calls needed, works on macOS and +Linux/WSL. 
+ +```bash +cd tests && python3 -m pytest +# or +bash tests/run.sh +``` + +The test suite uses a disposable `tmp_wiki` fixture so no test ever +touches your real wiki. + +--- + +## Credits and inspiration + +This repo is a synthesis of two existing ideas with an automation layer +on top. It would not exist without either of them. + +**Core pattern — [Andrej Karpathy — "Agent-Maintained Persistent Wiki" gist](https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f)** +The foundational idea of a compounding LLM-maintained wiki that moves +synthesis from query-time (RAG) to ingest-time. This repo is an +implementation of Karpathy's pattern with the community-identified +failure modes plugged. + +**Structural memory taxonomy — [milla-jovovich/mempalace](https://github.com/milla-jovovich/mempalace)** +The wing/room/hall/closet/drawer/tunnel concepts that turn a flat +corpus into something you can navigate without reading everything. See +[`ARCHITECTURE.md#borrowed-concepts`](docs/ARCHITECTURE.md#borrowed-concepts) +for the explicit mapping of MemPalace terms to this repo's +implementation. + +**Search layer — [qmd](https://github.com/tobi/qmd)** by Tobi Lütke +(Shopify CEO). Local BM25 + vector + LLM re-ranking on markdown files. +Chosen over ChromaDB because it uses the same storage format as the +wiki — one index to maintain, not two. Explicitly recommended by +Karpathy as well. + +**URL extraction stack** — [trafilatura](https://github.com/adbar/trafilatura) +for fast static-page extraction and [crawl4ai](https://github.com/unclecode/crawl4ai) +for JS-rendered and anti-bot cases. The two-tool cascade handles +essentially any web content without needing a full browser stack for +simple pages. + +**The agent** — [Claude Code](https://claude.com/claude-code) by Anthropic. +The repo is Claude-specific (see the section above for what that means +and how to adapt for other agents). 
+ +**Design process** — this repo was designed interactively with Claude +as a structured Signal & Noise analysis before any code was written. +The interactive design artifact is here: +[The LLM Wiki — Karpathy's Pattern — Signal & Noise](https://claude.ai/public/artifacts/0f6e1d9b-3b8c-43df-99d7-3a4328a1620c). +That artifact walks through the seven real strengths and seven real +weaknesses of the core pattern, then works through concrete mitigations +for each weakness. Every component in this repo maps back to a specific +mitigation identified there. +[`docs/DESIGN-RATIONALE.md`](docs/DESIGN-RATIONALE.md) is the condensed +version of that analysis as it applies to this implementation. + +--- + +## License + +MIT — see [`LICENSE`](LICENSE). + +## Contributing + +This is a personal project that I'm making public in case the pattern is +useful to others. Issues and PRs welcome, but I make no promises about +response time. If you fork and make it your own, I'd love to hear how you +adapted it. diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..2170a83 --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,114 @@ +# Example configuration — copy to config.yaml and edit for your setup. +# +# This file is NOT currently read by any script (see docs/CUSTOMIZE.md +# "What I'd change if starting over" #1). The scripts use inline +# constants with "CONFIGURE ME" comments instead. This file is a +# template for a future refactor and a reference for what the +# configurable surface looks like. 
+# +# For now, edit the constants directly in: +# scripts/extract-sessions.py (PROJECT_MAP) +# scripts/update-conversation-index.py (PROJECT_NAMES, PROJECT_ORDER) +# scripts/wiki-harvest.py (SKIP_DOMAIN_PATTERNS) + +# ─── Project / wing configuration ────────────────────────────────────────── +projects: + # Map Claude Code directory suffixes to short project codes (wings) + map: + projects-wiki: wiki # this wiki's own sessions + -claude: cl # ~/.claude config repo + my-webapp: web # your project dirs + mobile-app: mob + work-monorepo: work + -home: general # catch-all + -Users: general + + # Display names for each project code + names: + wiki: WIKI — This Wiki + cl: CL — Claude Config + web: WEB — My Webapp + mob: MOB — Mobile App + work: WORK — Day Job + general: General — Cross-Project + + # Display order (most-active first) + order: + - work + - web + - mob + - wiki + - cl + - general + +# ─── URL harvesting configuration ────────────────────────────────────────── +harvest: + # Domains to always skip (internal, ephemeral, personal). + # Patterns use re.search, so unanchored suffixes like \.example\.com$ work. 
+ skip_domains: + - \.atlassian\.net$ + - ^app\.asana\.com$ + - ^(www\.)?slack\.com$ + - ^(www\.)?discord\.com$ + - ^mail\.google\.com$ + - ^calendar\.google\.com$ + - ^.+\.local$ + - ^.+\.internal$ + # Add your own: + - \.mycompany\.com$ + - ^git\.mydomain\.com$ + + # Type C URLs (issue trackers, Q&A) — only harvested if topic covered + c_type_patterns: + - ^https?://github\.com/[^/]+/[^/]+/issues/\d+ + - ^https?://github\.com/[^/]+/[^/]+/pull/\d+ + - ^https?://(www\.)?stackoverflow\.com/questions/\d+ + + # Fetch behavior + fetch_delay_seconds: 2 + max_failed_attempts: 3 + min_content_length: 100 + fetch_timeout: 45 + +# ─── Hygiene / staleness configuration ───────────────────────────────────── +hygiene: + # Confidence decay thresholds (days since last_verified) + decay: + high_to_medium: 180 # 6 months + medium_to_low: 270 # 9 months (6+3) + low_to_stale: 365 # 12 months (6+3+3) + + # Pages with body shorter than this are flagged as stubs + empty_stub_threshold_chars: 100 + + # Version regex for technology lifecycle checks (which tools to track) + version_regex: '\b(?:Node(?:\.js)?|Python|Docker|PostgreSQL|MySQL|Redis|Next\.js|NestJS)\s+(\d+(?:\.\d+)?)' + +# ─── LLM configuration ───────────────────────────────────────────────────── +llm: + # Which backend to use for summarization and compilation + # Options: claude | openai | local | ollama + backend: claude + + # Routing threshold — sessions/content above this use the larger model + long_threshold_chars: 20000 + long_threshold_messages: 200 + + # Per-backend settings + claude: + short_model: haiku + long_model: sonnet + timeout: 600 + + openai: + short_model: gpt-4o-mini + long_model: gpt-4o + api_key_env: OPENAI_API_KEY + + local: + base_url: http://localhost:8080/v1 + model: Phi-4-14B-Q4_K_M + + ollama: + base_url: http://localhost:11434/v1 + model: phi4:14b diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..39811c8 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 
+1,360 @@ +# Architecture + +Eleven scripts across three conceptual layers. This document walks through +what each one does, how they talk to each other, and where the seams are +for customization. + +> **See also**: [`DESIGN-RATIONALE.md`](DESIGN-RATIONALE.md) — the *why* +> behind each component, with links to the interactive design artifact. + +## Borrowed concepts + +The architecture is a synthesis of two external ideas with an automation +layer on top. The terminology often maps 1:1, so it's worth calling out +which concepts came from where: + +### From Karpathy's persistent-wiki gist + +| Concept | How this repo implements it | +|---------|-----------------------------| +| Immutable `raw/` sources | `raw/` directory — never modified by the agent | +| LLM-compiled `wiki/` pages | `patterns/` `decisions/` `concepts/` `environments/` | +| Schema file disciplining the agent | `CLAUDE.md` at the wiki root | +| Periodic "lint" passes | `wiki-hygiene.py --quick` (daily) + `--full` (weekly) | +| Wiki as fine-tuning material | Clean markdown body is ready for synthetic training data | + +### From [mempalace](https://github.com/milla-jovovich/mempalace) + +MemPalace gave us the structural memory taxonomy that turns a flat +corpus into something you can navigate without reading everything. 
The +concepts map directly: + +| MemPalace term | Meaning | How this repo implements it | +|----------------|---------|-----------------------------| +| **Wing** | Per-person or per-project namespace | Project code in `conversations//` (set by `PROJECT_MAP` in `extract-sessions.py`) | +| **Room** | Topic within a wing | `topics:` frontmatter field on summarized conversation files | +| **Closet** | Summary layer — high-signal compressed knowledge | The summary body written by `summarize-conversations.py --claude` | +| **Drawer** | Verbatim archive, never lost | The extracted transcript under `conversations//*.md` (before summarization) | +| **Hall** | Memory-type corridor (fact / event / discovery / preference / advice / tooling) | `halls:` frontmatter field classified by the summarizer | +| **Tunnel** | Cross-wing connection — same topic in multiple projects | `related:` frontmatter linking conversations to wiki pages and to each other | + +The key benefit of wing + room filtering is documented in MemPalace's +benchmarks as a **+34% retrieval boost** over flat search — because +`qmd` can search a pre-narrowed subset of the corpus instead of +everything. This is why the wiki scales past the Karpathy-pattern's +~50K token ceiling without needing a full vector DB rebuild. 
+ +### What this repo adds + +Automation + lifecycle management on top of both: + +- **Automation layer** — cron-friendly orchestration via `wiki-maintain.sh` +- **Staging pipeline** — human-in-the-loop checkpoint for automated content +- **Confidence decay + auto-archive + auto-restore** — the retention curve +- **`qmd` integration** — the scalable search layer (chosen over ChromaDB + because it uses markdown storage like the wiki itself) +- **Hygiene reports** — fixed vs needs-review separation +- **Cross-machine sync** — git with markdown merge-union + +--- + +## Overview + +``` + ┌─────────────────────────────────┐ + │ SYNC LAYER │ + │ wiki-sync.sh │ (git commit/pull/push, qmd reindex) + └─────────────────────────────────┘ + │ + ┌─────────────────────────────────┐ + │ MINING LAYER │ + │ extract-sessions.py │ (Claude Code JSONL → markdown) + │ summarize-conversations.py │ (LLM classify + summarize) + │ update-conversation-index.py │ (regenerate indexes + wake-up) + │ mine-conversations.sh │ (orchestrator) + └─────────────────────────────────┘ + │ + ┌─────────────────────────────────┐ + │ AUTOMATION LAYER │ + │ wiki_lib.py (shared helpers) │ + │ wiki-harvest.py │ (URL → raw → staging) + │ wiki-staging.py │ (human review) + │ wiki-hygiene.py │ (decay, archive, repair, checks) + │ wiki-maintain.sh │ (orchestrator) + └─────────────────────────────────┘ +``` + +Each layer is independent — you can run the mining layer without the +automation layer, or vice versa. The layers communicate through files on +disk (conversation markdown, raw harvested pages, staging pages, wiki +pages), never through in-memory state. + +--- + +## Mining layer + +### `extract-sessions.py` + +Parses Claude Code JSONL session files from `~/.claude/projects/` into +clean markdown transcripts under `conversations//`. +Deterministic, no LLM calls. Incremental — tracks byte offsets in +`.mine-state.json` so it safely re-runs on partially-processed sessions. 
+ +Key features: +- Summarizes tool calls intelligently: full output for `Bash` and `Skill`, + paths-only for `Read`/`Glob`/`Grep`, path + summary for `Edit`/`Write` +- Caps Bash output at 200 lines to prevent transcript bloat +- Handles session resumption — if a session has grown since last extraction, + it appends new messages without re-processing old ones +- Maps Claude project directory names to short wiki codes via `PROJECT_MAP` + +### `summarize-conversations.py` + +Sends extracted transcripts to an LLM for classification and summarization. +Supports two backends: + +1. **`--claude` mode** (recommended): Uses `claude -p` with + haiku for short sessions (≤200 messages) and sonnet for longer ones. + Runs chunked over long transcripts, keeping a rolling context window. + +2. **Local LLM mode** (default, omit `--claude`): Uses a local + `llama-server` instance at `localhost:8080` (or WSL gateway:8081 on + Windows Subsystem for Linux). Requires llama.cpp installed and a GGUF + model loaded. + +Output: adds frontmatter to each conversation file — `topics`, `halls` +(fact/discovery/preference/advice/event/tooling), and `related` wiki +page links. The `related` links are load-bearing: they're what +`wiki-hygiene.py` uses to refresh `last_verified` on pages that are still +being discussed. + +### `update-conversation-index.py` + +Regenerates three files from the summarized conversations: + +- `conversations/index.md` — catalog of all conversations grouped by project +- `context/wake-up.md` — a ~200-token briefing the agent loads at the start + of every session ("current focus areas, recent decisions, active + concerns") +- `context/active-concerns.md` — longer-form current state + +The wake-up file is important: it's what gives the agent *continuity* +across sessions without forcing you to re-explain context every time. + +### `mine-conversations.sh` + +Orchestrator chaining extract → summarize → index. 
Supports +`--extract-only`, `--summarize-only`, `--index-only`, `--project <code>`, +and `--dry-run`.
+ +Compilation step: sends the raw content + `index.md` + conversation +context to `claude -p`, asking for a JSON verdict: +- `new_page` — create a new wiki page +- `update_page` — update an existing page (with `modifies:` field) +- `both` — do both +- `skip` — content isn't substantive enough + +Result lands in `staging//` with `origin: automated`, +`status: pending`, and all the staging-specific frontmatter that gets +stripped on promotion. + +### `wiki-staging.py` + +Pure file operations — no LLM calls. Human review pipeline for automated +content. + +Commands: +- `--list` / `--list --json` — pending items with metadata +- `--stats` — counts by type/source + age stats +- `--review` — interactive a/r/s/q loop with preview +- `--promote ` — approve, strip staging fields, move to live, update + main index, rewrite cross-refs, preserve `origin: automated` as audit trail +- `--reject --reason "..."` — delete, record in + `.harvest-state.json` rejected_urls so the harvester won't re-create +- `--promote-all` — bulk approve everything +- `--sync` — regenerate `staging/index.md`, detect drift + +### `wiki-hygiene.py` + +The heavy lifter. 
Two modes: + +**Quick mode** (no LLM, ~1 second on a 100-page wiki, run daily): +- Backfill `last_verified` from `last_compiled`/git/mtime +- Refresh `last_verified` from conversation `related:` links — this is + the "something's still being discussed" signal +- Auto-restore archived pages that are referenced again +- Repair frontmatter (missing/invalid fields get sensible defaults) +- Apply confidence decay per thresholds (6/9/12 months) +- Archive stale and superseded pages +- Detect index drift (pages on disk not in index, stale index entries) +- Detect orphan pages (no inbound links) and auto-add them to index +- Detect broken cross-references, fuzzy-match to the intended target + via `difflib.get_close_matches`, fix in place +- Report empty stubs (body < 100 chars) +- Detect state file drift (references to missing files) +- Regenerate `staging/index.md` and `archive/index.md` if out of sync + +**Full mode** (LLM-powered, run weekly — extends quick mode with): +- Missing cross-references (haiku, batched 5 pages per call) +- Duplicate coverage (sonnet — weaker merged into stronger, auto-archives + the loser with `archived_reason: Merged into `) +- Contradictions (sonnet, **report-only** — the human decides) +- Technology lifecycle (regex + conversation comparison — flags pages + mentioning `Node 18` when recent conversations are using `Node 20`) + +State lives in `.hygiene-state.json` — tracks content hashes per page so +full-mode runs can skip unchanged pages. Reports land in +`reports/hygiene-YYYY-MM-DD-{fixed,needs-review}.md`. + +### `wiki-maintain.sh` + +Top-level orchestrator: + +``` +Phase 1: wiki-harvest.py (unless --hygiene-only) +Phase 2: wiki-hygiene.py (--full for the weekly pass, else quick) +Phase 3: qmd update && qmd embed (unless --no-reindex or --dry-run) +``` + +Flags pass through to child scripts. Error-tolerant: if one phase fails, +the others still run. Logs to `scripts/.maintain.log`. 
+ +--- + +## Sync layer + +### `wiki-sync.sh` + +Git-based sync for cross-machine use. Commands: + +- `--commit` — stage and commit local changes +- `--pull` — `git pull` with markdown merge-union (keeps both sides on conflict) +- `--push` — push to origin +- `full` — commit + pull + push + qmd reindex +- `--status` — read-only sync state report + +The `.gitattributes` file sets `*.md merge=union` so markdown conflicts +auto-resolve by keeping both versions. This works because most conflicts +are additive (two machines both adding new entries). + +--- + +## State files + +Three JSON files track per-pipeline state: + +| File | Owner | Synced? | Purpose | +|------|-------|---------|---------| +| `.mine-state.json` | `extract-sessions.py`, `summarize-conversations.py` | No (gitignored) | Per-session byte offsets — local filesystem state, not portable | +| `.harvest-state.json` | `wiki-harvest.py` | Yes (committed) | URL dedup — harvested/skipped/failed/rejected URLs | +| `.hygiene-state.json` | `wiki-hygiene.py` | Yes (committed) | Page content hashes, deferred issues, last-run timestamps | + +Harvest and hygiene state need to sync across machines so both +installations agree on what's been processed. Mining state is per-machine +because Claude Code session files live at OS-specific paths. + +--- + +## Module dependency graph + +``` +wiki_lib.py ─┬─> wiki-harvest.py + ├─> wiki-staging.py + └─> wiki-hygiene.py + +wiki-maintain.sh ─> wiki-harvest.py + ─> wiki-hygiene.py + ─> qmd (external) + +mine-conversations.sh ─> extract-sessions.py + ─> summarize-conversations.py + ─> update-conversation-index.py + +extract-sessions.py (standalone — reads Claude JSONL) +summarize-conversations.py ─> claude CLI (or llama-server) +update-conversation-index.py ─> qmd (external) +``` + +`wiki_lib.py` is the only shared Python module — everything else is +self-contained within its layer. + +--- + +## Extension seams + +The places to modify when customizing: + +1. 
**`scripts/extract-sessions.py`** — `PROJECT_MAP` controls how Claude + project directories become wiki "wings". Also `KEEP_FULL_OUTPUT_TOOLS`, + `SUMMARIZE_TOOLS`, `MAX_BASH_OUTPUT_LINES` to tune transcript shape. + +2. **`scripts/update-conversation-index.py`** — `PROJECT_NAMES` and + `PROJECT_ORDER` control how the index groups conversations. + +3. **`scripts/wiki-harvest.py`** — + - `SKIP_DOMAIN_PATTERNS` — your internal domains + - `C_TYPE_URL_PATTERNS` — URL shapes that need topic-match before harvesting + - `FETCH_DELAY_SECONDS` — rate limit between fetches + - `COMPILE_PROMPT_TEMPLATE` — what the AI compile step tells the LLM + - `SONNET_CONTENT_THRESHOLD` — size cutoff for haiku vs sonnet + +4. **`scripts/wiki-hygiene.py`** — + - `DECAY_HIGH_TO_MEDIUM` / `DECAY_MEDIUM_TO_LOW` / `DECAY_LOW_TO_STALE` + — decay thresholds in days + - `EMPTY_STUB_THRESHOLD` — what counts as a stub + - `VERSION_REGEX` — which tools/runtimes to track for lifecycle checks + - `REQUIRED_FIELDS` — frontmatter fields the repair step enforces + +5. **`scripts/summarize-conversations.py`** — + - `CLAUDE_LONG_THRESHOLD` — haiku/sonnet routing cutoff + - `MINE_PROMPT_FILE` — the LLM system prompt for summarization + - Backend selection (claude vs llama-server) + +6. **`CLAUDE.md`** at the wiki root — the instructions the agent reads + every session. This is where you tell the agent how to maintain the + wiki, what conventions to follow, when to flag things to you. + +See [`docs/CUSTOMIZE.md`](CUSTOMIZE.md) for recipes. diff --git a/docs/CUSTOMIZE.md b/docs/CUSTOMIZE.md new file mode 100644 index 0000000..e4e641f --- /dev/null +++ b/docs/CUSTOMIZE.md @@ -0,0 +1,432 @@ +# Customization Guide + +This repo is built around Claude Code, cron-based automation, and a +specific directory layout. None of those are load-bearing for the core +idea. This document walks through adapting it for different agents, +different scheduling, and different subsets of functionality. 
+ +## What's actually required for the core idea + +The minimum viable compounding wiki is: + +1. A markdown directory tree +2. An agent that reads the tree at the start of a session and writes to + it during the session +3. Some convention (a `CLAUDE.md` or equivalent) telling the agent how to + maintain the wiki + +**Everything else in this repo is optional optimization** — automated +extraction, URL harvesting, hygiene checks, cron scheduling. They're +worth the setup effort once the wiki grows past a few dozen pages, but +they're not the *idea*. + +--- + +## Adapting for non-Claude-Code agents + +Four script components are Claude-specific. Each has a natural +replacement path: + +### 1. `extract-sessions.py` — Claude Code JSONL parsing + +**What it does**: Reads session files from `~/.claude/projects/` and +converts them to markdown transcripts. + +**What's Claude-specific**: The JSONL format and directory structure are +specific to the Claude Code CLI. Other agents don't produce these files. + +**Replacements**: + +- **Cursor**: Cursor stores chat history in `~/Library/Application + Support/Cursor/User/globalStorage/` (macOS) as SQLite. Write an + equivalent `extract-sessions.py` that queries that SQLite and produces + the same markdown format. +- **Aider**: Aider stores chat history as `.aider.chat.history.md` in + each project directory. A much simpler extractor: walk all project + directories, read each `.aider.chat.history.md`, split on session + boundaries, write to `conversations//`. +- **OpenAI Codex / gemini CLI / other**: Whatever session format your + tool uses — the target format is a markdown file with a specific + frontmatter shape (`title`, `type: conversation`, `project`, `date`, + `status: extracted`, `messages: N`, body of user/assistant turns). + Anything that produces files in that shape will flow through the rest + of the pipeline unchanged. +- **No agent at all — just manual**: Skip this script entirely. 
Paste + interesting conversations into `conversations/general/YYYY-MM-DD-slug.md` + by hand and set `status: extracted` yourself. + +The pipeline downstream of `extract-sessions.py` doesn't care how the +transcripts got there, only that they exist with the right frontmatter. + +### 2. `summarize-conversations.py` — `claude -p` summarization + +**What it does**: Classifies extracted conversations into "halls" +(fact/discovery/preference/advice/event/tooling) and writes summaries. + +**What's Claude-specific**: Uses `claude -p` with haiku/sonnet routing. + +**Replacements**: + +- **OpenAI**: Replace the `call_claude` helper with a function that calls + `openai` Python SDK or `gpt` CLI. Use gpt-4o-mini for short + conversations (equivalent to haiku routing) and gpt-4o for long ones. +- **Local LLM**: The script already supports this path — just omit the + `--claude` flag and run a `llama-server` on localhost:8080 (or the WSL + gateway IP on Windows). Phi-4-14B scored 400/400 on our internal eval. +- **Ollama**: Point `AI_BASE_URL` at your Ollama endpoint (e.g. + `http://localhost:11434/v1`). Ollama exposes an OpenAI-compatible API. +- **Any OpenAI-compatible endpoint**: `AI_BASE_URL` and `AI_MODEL` env + vars configure the script — no code changes needed. +- **No LLM at all — manual summaries**: Edit each conversation file by + hand to set `status: summarized` and add your own `topics`/`related` + frontmatter. Tedious but works for a small wiki. + +### 3. `wiki-harvest.py` — AI compile step + +**What it does**: After fetching raw URL content, sends it to `claude -p` +to get a structured JSON verdict (new_page / update_page / both / skip) +plus the page content. + +**What's Claude-specific**: `claude -p --model haiku|sonnet`. + +**Replacements**: + +- **Any other LLM**: Replace `call_claude_compile()` with a function that + calls your preferred backend. The prompt template + (`COMPILE_PROMPT_TEMPLATE`) is reusable — just swap the transport. 
+- **Skip AI compilation entirely**: Run `wiki-harvest.py --no-compile` + and the harvester will save raw content to `raw/harvested/` without + trying to compile it. You can then manually (or via a different script) + turn the raw content into wiki pages. + +### 4. `wiki-hygiene.py --full` — LLM-powered checks + +**What it does**: Duplicate detection, contradiction detection, missing +cross-reference suggestions. + +**What's Claude-specific**: `claude -p --model haiku|sonnet`. + +**Replacements**: + +- **Same as #3**: Replace the `call_claude()` helper in `wiki-hygiene.py`. +- **Skip full mode entirely**: Only run `wiki-hygiene.py --quick` (the + default). Quick mode has no LLM calls and catches 90% of structural + issues. Contradictions and duplicates just have to be caught by human + review during `wiki-staging.py --review` sessions. + +### 5. `CLAUDE.md` at the wiki root + +**What it does**: The instructions Claude Code reads at the start of +every session that explain the wiki schema and maintenance operations. + +**What's Claude-specific**: The filename. Claude Code specifically looks +for `CLAUDE.md`; other agents look for other files. + +**Replacements**: + +| Agent | Equivalent file | +|-------|-----------------| +| Claude Code | `CLAUDE.md` | +| Cursor | `.cursorrules` or `.cursor/rules/` | +| Aider | `CONVENTIONS.md` (read via `--read CONVENTIONS.md`) | +| Gemini CLI | `GEMINI.md` | +| Continue.dev | `config.json` prompts or `.continue/rules/` | + +The content is the same — just rename the file and point your agent at +it. + +--- + +## Running without cron + +Cron is convenient but not required. 
Alternatives: + +### Manual runs + +Just call the scripts when you want the wiki updated: + +```bash +cd ~/projects/wiki + +# When you want to ingest new Claude Code sessions +bash scripts/mine-conversations.sh + +# When you want hygiene + harvest +bash scripts/wiki-maintain.sh + +# When you want the expensive LLM pass +bash scripts/wiki-maintain.sh --hygiene-only --full +``` + +This is arguably *better* than cron if you work in bursts — run +maintenance when you start a session, not on a schedule. + +### systemd timers (Linux) + +More observable than cron, better journaling: + +```ini +# ~/.config/systemd/user/wiki-maintain.service +[Unit] +Description=Wiki maintenance pipeline + +[Service] +Type=oneshot +WorkingDirectory=%h/projects/wiki +ExecStart=/usr/bin/bash %h/projects/wiki/scripts/wiki-maintain.sh +``` + +```ini +# ~/.config/systemd/user/wiki-maintain.timer +[Unit] +Description=Run wiki-maintain daily + +[Timer] +OnCalendar=daily +Persistent=true + +[Install] +WantedBy=timers.target +``` + +```bash +systemctl --user enable --now wiki-maintain.timer +journalctl --user -u wiki-maintain.service # see logs +``` + +### launchd (macOS) + +More native than cron on macOS: + +```xml + + + + + + Labelcom.user.wiki-maintain + ProgramArguments + + /bin/bash + /Users/YOUR_USER/projects/wiki/scripts/wiki-maintain.sh + + StartCalendarInterval + + Hour3 + Minute0 + + StandardOutPath/tmp/wiki-maintain.log + StandardErrorPath/tmp/wiki-maintain.err + + +``` + +```bash +launchctl load ~/Library/LaunchAgents/com.user.wiki-maintain.plist +launchctl list | grep wiki # verify +``` + +### Git hooks (pre-push) + +Run hygiene before every push so the wiki is always clean when it hits +the remote: + +```bash +cat > ~/projects/wiki/.git/hooks/pre-push <<'HOOK' +#!/usr/bin/env bash +set -euo pipefail +bash ~/projects/wiki/scripts/wiki-maintain.sh --hygiene-only --no-reindex +HOOK +chmod +x ~/projects/wiki/.git/hooks/pre-push +``` + +Downside: every push is slow. 
Upside: you never push a broken wiki. + +### CI pipeline + +Run `wiki-hygiene.py --check-only` in a CI workflow on every PR: + +```yaml +# .github/workflows/wiki-check.yml (or .gitea/workflows/...) +name: Wiki hygiene check +on: [push, pull_request] +jobs: + hygiene: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - run: python3 scripts/wiki-hygiene.py --check-only +``` + +`--check-only` reports issues without auto-fixing them, so CI can flag +problems without modifying files. + +--- + +## Minimal subsets + +You don't have to run the whole pipeline. Pick what's useful: + +### "Just the wiki" (no automation) + +- Delete `scripts/wiki-*` and `scripts/*-conversations*` +- Delete `tests/` +- Keep the directory structure (`patterns/`, `decisions/`, etc.) +- Keep `index.md` and `CLAUDE.md` +- Write and maintain the wiki manually with your agent + +This is the Karpathy-gist version. Works great for small wikis. + +### "Wiki + mining" (no harvesting, no hygiene) + +- Keep the mining layer (`extract-sessions.py`, `summarize-conversations.py`, `update-conversation-index.py`) +- Delete the automation layer (`wiki-harvest.py`, `wiki-hygiene.py`, `wiki-staging.py`, `wiki-maintain.sh`) +- The wiki grows from session mining but you maintain it manually + +Useful if you want session continuity (the wake-up briefing) without +the full automation. + +### "Wiki + hygiene" (no mining, no harvesting) + +- Keep `wiki-hygiene.py` and `wiki_lib.py` +- Delete everything else +- Run `wiki-hygiene.py --quick` periodically to catch structural issues + +Useful if you write the wiki manually but want automated checks for +orphans, broken links, and staleness. + +### "Wiki + harvesting" (no session mining) + +- Keep `wiki-harvest.py`, `wiki-staging.py`, `wiki_lib.py` +- Delete mining scripts +- Source URLs manually — put them in a file and point the harvester at + it. 
You'd need to write a wrapper that extracts URLs from your source + file and feeds them into the fetch cascade. + +Useful if URLs come from somewhere other than Claude Code sessions +(e.g. browser bookmarks, Pocket export, RSS). + +--- + +## Schema customization + +The repo uses these live content types: + +- `patterns/` — HOW things should be built +- `decisions/` — WHY we chose this approach +- `concepts/` — WHAT the foundational ideas are +- `environments/` — WHERE implementations differ + +These reflect my engineering-focused use case. Your wiki might need +different categories. To change them: + +1. Rename / add directories under the wiki root +2. Edit `LIVE_CONTENT_DIRS` in `scripts/wiki_lib.py` +3. Update the `type:` frontmatter validation in + `scripts/wiki-hygiene.py` (`VALID_TYPES` constant) +4. Update `CLAUDE.md` to describe the new categories +5. Update `index.md` section headers to match + +Examples of alternative schemas: + +**Research wiki**: +- `findings/` — experimental results +- `hypotheses/` — what you're testing +- `methods/` — how you test +- `literature/` — external sources + +**Product wiki**: +- `features/` — what the product does +- `decisions/` — why we chose this +- `users/` — personas, interviews, feedback +- `metrics/` — what we measure + +**Personal knowledge wiki**: +- `topics/` — general subject matter +- `projects/` — specific ongoing work +- `journal/` — dated entries +- `references/` — external links/papers + +None of these are better or worse — pick what matches how you think. + +--- + +## Frontmatter customization + +The required fields are documented in `CLAUDE.md` (frontmatter spec). +You can add your own fields freely — the parser and hygiene checks +ignore unknown keys. 
+ +Useful additions you might want: + +```yaml +author: alice # who wrote or introduced the page +tags: [auth, security] # flat tag list +urgency: high # for to-do-style wiki pages +stakeholders: # who cares about this page + - product-team + - security-team +review_by: 2026-06-01 # explicit review date instead of age-based decay +``` + +If you want age-based decay to key off a different field than +`last_verified` (say, `review_by`), edit `expected_confidence()` in +`scripts/wiki-hygiene.py` to read from your custom field. + +--- + +## Working across multiple wikis + +The scripts all honor the `WIKI_DIR` environment variable. Run multiple +wikis against the same scripts: + +```bash +# Work wiki +WIKI_DIR=~/projects/work-wiki bash scripts/wiki-maintain.sh + +# Personal wiki +WIKI_DIR=~/projects/personal-wiki bash scripts/wiki-maintain.sh + +# Research wiki +WIKI_DIR=~/projects/research-wiki bash scripts/wiki-maintain.sh +``` + +Each has its own state files, its own cron entries, its own qmd +collection. You can symlink or copy `scripts/` into each wiki, or run +all three against a single checked-out copy of the scripts. + +--- + +## What I'd change if starting over + +Honest notes on the design choices, in case you're about to fork: + +1. **Config should be in YAML, not inline constants.** I bolted a + "CONFIGURE ME" comment onto `PROJECT_MAP` and `SKIP_DOMAIN_PATTERNS` + as a shortcut. Better: a `config.yaml` at the wiki root that all + scripts read. + +2. **The mining layer is tightly coupled to Claude Code.** A cleaner + design would put a `Session` interface in `wiki_lib.py` and have + extractors for each agent produce `Session` objects — the rest of the + pipeline would be agent-agnostic. + +3. **The hygiene script is a monolith.** 1100+ lines is a lot. Splitting + it into `wiki_hygiene/checks.py`, `wiki_hygiene/archive.py`, + `wiki_hygiene/llm.py`, etc., would be cleaner. It started as a single + file and grew. + +4. 
**The hyphenated filenames (`wiki-harvest.py`) make Python imports + awkward.** Standard Python convention is underscores. I used hyphens + for consistency with the shell scripts, and `conftest.py` has a + module-loader workaround. A cleaner fork would use underscores + everywhere. + +5. **The wiki schema assumes you know what you want to catalog.** If + you don't, start with a free-form `notes/` directory and let + categories emerge organically, then refactor into `patterns/` etc. + later. + +None of these are blockers. They're all "if I were designing v2" +observations. diff --git a/docs/DESIGN-RATIONALE.md b/docs/DESIGN-RATIONALE.md new file mode 100644 index 0000000..b9c3487 --- /dev/null +++ b/docs/DESIGN-RATIONALE.md @@ -0,0 +1,338 @@ +# Design Rationale — Signal & Noise + +Why each part of this repo exists. This is the "why" document; the other +docs are the "what" and "how." + +Before implementing anything, the design was worked out interactively +with Claude as a structured Signal & Noise analysis of Andrej Karpathy's +original persistent-wiki pattern: + +> **Interactive design artifact**: [The LLM Wiki — Karpathy's Pattern — Signal & Noise](https://claude.ai/public/artifacts/0f6e1d9b-3b8c-43df-99d7-3a4328a1620c) + +That artifact walks through the pattern's seven genuine strengths, seven +real weaknesses, and concrete mitigations for each weakness. This repo +is the implementation of those mitigations. If you want to understand +*why* a component exists, the artifact has the longer-form argument; this +document is the condensed version. + +--- + +## Where the pattern is genuinely strong + +The analysis found seven strengths that hold up under scrutiny. 
This +repo preserves all of them: + +| Strength | How this repo keeps it | +|----------|-----------------------| +| **Knowledge compounds over time** | Every ingest adds to the existing wiki rather than restarting; conversation mining and URL harvesting continuously feed new material in | +| **Zero maintenance burden on humans** | Cron-driven harvest + hygiene; the only manual step is staging review, and that's fast because the AI already compiled the page | +| **Token-efficient at personal scale** | `index.md` fits in context; `qmd` kicks in only at 50+ articles; the wake-up briefing is ~200 tokens | +| **Human-readable & auditable** | Plain markdown everywhere; every cross-reference is visible; git history shows every change | +| **Future-proof & portable** | No vendor lock-in; you can point any agent at the same tree tomorrow | +| **Self-healing via lint passes** | `wiki-hygiene.py` runs quick checks daily and full (LLM) checks weekly | +| **Path to fine-tuning** | Wiki pages are high-quality synthetic training data once purified through hygiene | + +--- + +## Where the pattern is genuinely weak — and how this repo answers + +The analysis identified seven real weaknesses. Five have direct +mitigations in this repo; two remain open trade-offs you should be aware +of. + +### 1. Errors persist and compound + +**The problem**: Unlike RAG — where a hallucination is ephemeral and the +next query starts clean — an LLM wiki persists its mistakes. If the LLM +incorrectly links two concepts at ingest time, future ingests build on +that wrong prior. + +**How this repo mitigates**: + +- **`confidence` field** — every page carries `high`/`medium`/`low` with + decay based on `last_verified`. Wrong claims aren't treated as + permanent — they age out visibly. +- **Archive + restore** — decayed pages get moved to `archive/` where + they're excluded from default search. 
If they get referenced again + they're auto-restored with `confidence: medium` (never straight to + `high` — they have to re-earn trust). +- **Raw harvested material is immutable** — `raw/harvested/*.md` files + are the ground truth. Every compiled wiki page can be traced back to + its source via the `sources:` frontmatter field. +- **Full-mode contradiction detection** — `wiki-hygiene.py --full` uses + sonnet to find conflicting claims across pages. Report-only (humans + decide which side wins). +- **Staging review** — automated content goes to `staging/` first. + Nothing enters the live wiki without human approval, so errors have + two chances to get caught (AI compile + human review) before they + become persistent. + +### 2. Hard scale ceiling at ~50K tokens + +**The problem**: The wiki approach stops working when `index.md` no +longer fits in context. Karpathy's own wiki was ~100 articles / 400K +words — already near the ceiling. + +**How this repo mitigates**: + +- **`qmd` from day one** — `qmd` (BM25 + vector + LLM re-ranking) is set + up in the default configuration so the agent never has to load the + full index. At 50+ pages, `qmd search` replaces `cat index.md`. +- **Wing/room structural filtering** — conversations are partitioned by + project code (wing) and topic (room, via the `topics:` frontmatter). + Retrieval is pre-narrowed to the relevant wing before search runs. + This extends the effective ceiling because `qmd` works on a relevant + subset, not the whole corpus. +- **Hygiene full mode flags redundancy** — duplicate detection auto-merges + weaker pages into stronger ones, keeping the corpus lean. +- **Archive excludes stale content** — the `wiki-archive` collection has + `includeByDefault: false`, so archived pages don't eat context until + explicitly queried. + +### 3. 
Manual cross-checking burden returns in precision-critical domains + +**The problem**: For API specs, version constraints, legal records, and +medical protocols, LLM-generated content needs human verification. The +maintenance burden you thought you'd eliminated comes back as +verification overhead. + +**How this repo mitigates**: + +- **Staging workflow** — every automated page goes through human review. + For precision-critical content, that review IS the cross-check. The + AI does the drafting; you verify. +- **`compilation_notes` field** — staging pages include the AI's own + explanation of what it did and why. Makes review faster — you can + spot-check the reasoning rather than re-reading the whole page. +- **Immutable raw sources** — every wiki claim traces back to a specific + file in `raw/harvested/` with a SHA-256 `content_hash`. Verification + means comparing the claim to the source, not "trust the LLM." +- **`confidence: low` for precision domains** — the agent's instructions + (via `CLAUDE.md`) tell it to flag low-confidence content when + citing. Humans see the warning before acting. + +**Residual trade-off**: For *truly* mission-critical data (legal, +medical, compliance), no amount of automation replaces domain-expert +review. If that's your use case, treat this repo as a *drafting* tool, +not a canonical source. + +### 4. Knowledge staleness without active upkeep + +**The problem**: Community analysis of 120+ comments on Karpathy's gist +found this is the #1 failure mode. Most people who try the pattern get +the folder structure right and still end up with a wiki that slowly +becomes unreliable because they stop feeding it. Six-week half-life is +typical. + +**How this repo mitigates** (this is the biggest thing): + +- **Automation replaces human discipline** — daily cron runs + `wiki-maintain.sh` (harvest + hygiene + qmd reindex); weekly cron runs + `--full` mode. You don't need to remember anything. 
+- **Conversation mining is the feed** — you don't need to curate sources + manually. Every Claude Code session becomes potential ingest. The + feed is automatic and continuous, as long as you're doing work. +- **`last_verified` refreshes from conversation references** — when the + summarizer links a conversation to a wiki page via `related:`, the + hygiene script picks that up and bumps `last_verified`. Pages stay + fresh as long as they're still being discussed. +- **Decay thresholds force attention** — pages without refresh signals + for 6/9/12 months get downgraded and eventually archived. The wiki + self-trims. +- **Hygiene reports** — `reports/hygiene-YYYY-MM-DD-needs-review.md` + flags the things that *do* need human judgment. Everything else is + auto-fixed. + +This is the single biggest reason this repo exists. The automation +layer is entirely about removing "I forgot to lint" as a failure mode. + +### 5. Cognitive outsourcing risk + +**The problem**: Hacker News critics argued that the bookkeeping +Karpathy outsources — filing, cross-referencing, summarizing — is +precisely where genuine understanding forms. Outsource it and you end up +with a comprehensive wiki you haven't internalized. + +**How this repo mitigates**: + +- **Staging review is a forcing function** — you see every automated + page before it lands. Even skimming forces engagement with the + material. +- **`qmd query "..."` for exploration** — searching the wiki is an + active process, not passive retrieval. You're asking questions, not + pulling a file. +- **The wake-up briefing** — `context/wake-up.md` is a 200-token digest + the agent reads at session start. You read it too (or the agent reads + it to you) — ongoing re-exposure to your own knowledge base. + +**Residual trade-off**: This is a real concern even with mitigations. +The wiki is designed as *augmentation*, not *replacement*. 
If you +never read your own wiki and only consult it through the agent, you're +in the outsourcing failure mode. The fix is discipline, not +architecture. + +### 6. Weaker semantic retrieval than RAG at scale + +**The problem**: At large corpora, vector embeddings find semantically +related content across different wording in ways explicit wikilinks +can't match. + +**How this repo mitigates**: + +- **`qmd` is hybrid (BM25 + vector)** — not just keyword search. Vector + similarity is built into the retrieval pipeline from day one. +- **Structural navigation complements semantic search** — project codes + (wings) and topic frontmatter narrow the search space before the + hybrid search runs. Structure + semantics is stronger than either + alone. +- **Missing cross-reference detection** — full-mode hygiene asks the + LLM to find pages that *should* link to each other but don't, then + auto-adds them. This is the explicit-linking approach catching up to + semantic retrieval over time. + +**Residual trade-off**: At enterprise scale (millions of documents), a +proper vector DB with specialized retrieval wins. This repo is for +personal / small-team scale where the hybrid approach is sufficient. + +### 7. No access control or multi-user support + +**The problem**: It's a folder of markdown files. No RBAC, no audit +logging, no concurrency handling, no permissions model. + +**How this repo mitigates**: + +- **Git-based sync with merge-union** — concurrent writes on different + machines auto-resolve because markdown is set to `merge=union` in + `.gitattributes`. Both sides win. +- **Network boundary as soft access control** — the suggested + deployment is over Tailscale or a VPN, so the network does the work a + RBAC layer would otherwise do. Not enterprise-grade, but sufficient + for personal/family/small-team use. + +**Residual trade-off**: **This is the big one.** The repo is not a +replacement for enterprise knowledge management. 
No audit trails, no +fine-grained permissions, no compliance story. If you need any of +that, you need a different architecture. This repo is explicitly +scoped to the personal/small-team use case. + +--- + +## The #1 failure mode — active upkeep + +Every other weakness has a mitigation. *Active upkeep is the one that +kills wikis in the wild.* The community data is unambiguous: + +- People who automate the lint schedule → wikis healthy at 6+ months +- People who rely on "I'll remember to lint" → wikis abandoned at 6 weeks + +The entire automation layer of this repo exists to remove upkeep as a +thing the human has to think about: + +| Cadence | Job | Purpose | +|---------|-----|---------| +| Every 15 min | `wiki-sync.sh` | Commit/pull/push — cross-machine sync | +| Every 2 hours | `wiki-sync.sh full` | Full sync + qmd reindex | +| Every hour | `mine-conversations.sh --extract-only` | Capture new Claude Code sessions (no LLM) | +| Daily 2am | `summarize-conversations.py --claude` + index | Classify + summarize (LLM) | +| Daily 3am | `wiki-maintain.sh` | Harvest + quick hygiene + reindex | +| Weekly Sun 4am | `wiki-maintain.sh --hygiene-only --full` | LLM-powered duplicate/contradiction/cross-ref detection | + +If you disable all of these, you get the same outcome as every +abandoned wiki: six-week half-life. The scripts aren't optional +convenience — they're the load-bearing answer to the pattern's primary +failure mode. 
+ +--- + +## What was borrowed from where + +This repo is a synthesis of two ideas with an automation layer on top: + +### From Karpathy + +- The core pattern: LLM-maintained persistent wiki, compile at ingest + time instead of retrieve at query time +- Separation of `raw/` (immutable sources) from `wiki/` (compiled pages) +- `CLAUDE.md` as the schema that disciplines the agent +- Periodic "lint" passes to catch orphans, contradictions, missing refs +- The idea that the wiki becomes fine-tuning material over time + +### From mempalace + +- **Wings** = per-person or per-project namespaces → this repo uses + project codes (`mc`, `wiki`, `web`, etc.) as the same thing in + `conversations/<project>/` +- **Rooms** = topics within a wing → the `topics:` frontmatter on + conversation files +- **Halls** = memory-type corridors (fact / event / discovery / + preference / advice / tooling) → the `halls:` frontmatter field + classified by the summarizer +- **Closets** = summary layer → the summary body of each summarized + conversation +- **Drawers** = verbatim archive, never lost → the extracted + conversation transcripts under `conversations/<project>/*.md` +- **Tunnels** = cross-wing connections → the `related:` frontmatter + linking conversations to wiki pages +- Wing + room structural filtering gives a documented +34% retrieval + boost over flat search + +The MemPalace taxonomy solved a problem Karpathy's pattern doesn't +address: how do you navigate a growing corpus without reading +everything? The answer is to give the corpus structural metadata at +ingest time, then filter on that metadata before doing semantic search. +This repo borrows that wholesale.
+ +### What this repo adds + +- **Automation layer** tying the pieces together with cron-friendly + orchestration +- **Staging pipeline** as a human-in-the-loop checkpoint for automated + content +- **Confidence decay + auto-archive + auto-restore** as the "retention + curve" that community analysis identified as critical for long-term + wiki health +- **`qmd` integration** as the scalable search layer (chosen over + ChromaDB because it uses the same markdown storage as the wiki — + one index to maintain, not two) +- **Hygiene reports** with fixed vs needs-review separation so + automation handles mechanical fixes and humans handle ambiguity +- **Cross-machine sync** via git with markdown merge-union so the same + wiki lives on multiple machines without merge hell + +--- + +## Honest residual trade-offs + +Five items from the analysis that this repo doesn't fully solve and +where you should know the limits: + +1. **Enterprise scale** — this is a personal/small-team tool. Millions + of documents, hundreds of users, RBAC, compliance: wrong + architecture. +2. **True semantic retrieval at massive scale** — `qmd` hybrid search + is great for thousands of pages, not millions. +3. **Cognitive outsourcing** — no architecture fix. Discipline + yourself to read your own wiki, not just query it through the agent. +4. **Precision-critical domains** — for legal/medical/regulatory data, + use this as a drafting tool, not a source of truth. Human + domain-expert review is not replaceable. +5. **Access control** — network boundary (Tailscale) is the fastest + path; nothing in the repo itself enforces permissions. + +If any of these are dealbreakers for your use case, a different +architecture is probably what you need. 
+ +--- + +## Further reading + +- [The original Karpathy gist](https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f) + — the concept +- [mempalace](https://github.com/milla-jovovich/mempalace) — the + structural memory layer +- [Signal & Noise interactive analysis](https://claude.ai/public/artifacts/0f6e1d9b-3b8c-43df-99d7-3a4328a1620c) + — the design rationale this document summarizes +- [README](../README.md) — the concept pitch +- [ARCHITECTURE.md](ARCHITECTURE.md) — component deep-dive +- [SETUP.md](SETUP.md) — installation +- [CUSTOMIZE.md](CUSTOMIZE.md) — adapting for non-Claude-Code setups diff --git a/docs/SETUP.md b/docs/SETUP.md new file mode 100644 index 0000000..69514ae --- /dev/null +++ b/docs/SETUP.md @@ -0,0 +1,502 @@ +# Setup Guide + +Complete installation for the full automation pipeline. For the conceptual +version (just the idea, no scripts), see the "Quick start — Path A" section +in the [README](../README.md). + +Tested on macOS (work machines) and Linux/WSL2 (home machines). Should work +on any POSIX system with Python 3.11+, Node.js 18+, and bash. + +--- + +## 1. Prerequisites + +### Required + +- **git** with SSH or HTTPS access to your remote (for cross-machine sync) +- **Node.js 18+** (for `qmd` search) +- **Python 3.11+** (for all pipeline scripts) +- **`claude` CLI** with valid authentication — Max subscription OAuth or + API key. Required for summarization and the harvester's AI compile step. + Without `claude`, you can still use the wiki, but the automation layer + falls back to manual or local-LLM paths. + +### Python tools (recommended via `pipx`) + +```bash +# URL content extraction — required for wiki-harvest.py +pipx install trafilatura +pipx install crawl4ai && crawl4ai-setup # installs Playwright browsers +``` + +Verify: `trafilatura --version` and `crwl --help` should both work. 
+ +### Optional + +- **`pytest`** — only needed to run the test suite (`pip install --user pytest`) +- **`llama.cpp` / `llama-server`** — only if you want the legacy local-LLM + summarization path instead of `claude -p` + +--- + +## 2. Clone the repo + +```bash +git clone <your-repo-url> ~/projects/wiki +cd ~/projects/wiki +``` + +The repo contains scripts, tests, docs, and example content — but no +actual wiki pages. The wiki grows as you use it. + +--- + +## 3. Configure qmd search + +`qmd` handles BM25 full-text search and vector search over the wiki. +The pipeline uses **three** collections: + +- **`wiki`** — live content (patterns/decisions/concepts/environments), + staging, and raw sources. The default search surface. +- **`wiki-archive`** — stale / superseded pages. Excluded from default + search; query explicitly with `-c wiki-archive` when digging into + history. +- **`wiki-conversations`** — mined Claude Code session transcripts. + Excluded from default search because they'd flood results with noisy + tool-call output; query explicitly with `-c wiki-conversations` when + looking for "what did I discuss about X last month?" + +```bash +npm install -g @tobilu/qmd +``` + +Configure via YAML directly — the CLI doesn't support `ignore` or +`includeByDefault`, so we edit the config file: + +```bash +mkdir -p ~/.config/qmd +cat > ~/.config/qmd/index.yml <<'YAML' +collections: + wiki: + path: /Users/YOUR_USER/projects/wiki # ← replace with your actual path + pattern: "**/*.md" + ignore: + - "archive/**" + - "reports/**" + - "plans/**" + - "conversations/**" + - "scripts/**" + - "context/**" + + wiki-archive: + path: /Users/YOUR_USER/projects/wiki/archive + pattern: "**/*.md" + includeByDefault: false + + wiki-conversations: + path: /Users/YOUR_USER/projects/wiki/conversations + pattern: "**/*.md" + includeByDefault: false + ignore: + - "index.md" +YAML +``` + +On Linux/WSL, replace `/Users/YOUR_USER` with `/home/YOUR_USER`.
+ +Build the indexes: + +```bash +qmd update # scan files into all three collections +qmd embed # generate vector embeddings (~2 min first run + ~30 min for conversations on CPU) +``` + +Verify: + +```bash +qmd collection list +# Expected: +# wiki — N files +# wiki-archive — M files [excluded] +# wiki-conversations — K files [excluded] +``` + +The `[excluded]` tag on the non-default collections confirms +`includeByDefault: false` is honored. + +**When to query which**: + +```bash +# "What's the current pattern for X?" +qmd search "topic" --json -n 5 + +# "What was the OLD pattern, before we changed it?" +qmd search "topic" -c wiki-archive --json -n 5 + +# "When did we discuss this, and what did we decide?" +qmd search "topic" -c wiki-conversations --json -n 5 + +# Everything — history + current + conversations +qmd search "topic" -c wiki -c wiki-archive -c wiki-conversations --json -n 10 +``` + +--- + +## 4. Configure the Python scripts + +Three scripts need per-user configuration: + +### `scripts/extract-sessions.py` — `PROJECT_MAP` + +This maps Claude Code project directory suffixes to short wiki codes +("wings"). Claude stores sessions under `~/.claude/projects/<hashed-path>/` +where the hashed path is derived from the absolute path to your project. + +Open the script and edit the `PROJECT_MAP` dict near the top. Look for +the `CONFIGURE ME` block. Examples: + +```python +PROJECT_MAP: dict[str, str] = { + "projects-wiki": "wiki", + "-claude": "cl", + "my-webapp": "web", # map "mydir/my-webapp" → wing "web" + "mobile-app": "mob", + "work-monorepo": "work", + "-home": "general", # catch-all for unmatched sessions +} +``` + +Run `ls ~/.claude/projects/` to see what directory names Claude is +actually producing on your machine — the suffix in `PROJECT_MAP` matches +against the end of each directory name.
+ +### `scripts/update-conversation-index.py` — `PROJECT_NAMES` / `PROJECT_ORDER` + +Matching display names for every code in `PROJECT_MAP`: + +```python +PROJECT_NAMES: dict[str, str] = { + "wiki": "WIKI — This Wiki", + "cl": "CL — Claude Config", + "web": "WEB — My Webapp", + "mob": "MOB — Mobile App", + "work": "WORK — Day Job", + "general": "General — Cross-Project", +} + +PROJECT_ORDER = [ + "work", "web", "mob", # most-active first + "wiki", "cl", "general", +] +``` + +### `scripts/wiki-harvest.py` — `SKIP_DOMAIN_PATTERNS` + +Add your internal/personal domains so the harvester doesn't try to fetch +them. Patterns use `re.search`: + +```python +SKIP_DOMAIN_PATTERNS = [ + # ... (generic ones are already there) + r"\.mycompany\.com$", + r"^git\.mydomain\.com$", +] +``` + +--- + +## 5. Create the post-merge hook + +The hook rebuilds the qmd index automatically after every `git pull`: + +```bash +cat > ~/projects/wiki/.git/hooks/post-merge <<'HOOK' +#!/usr/bin/env bash +set -euo pipefail + +if command -v qmd &>/dev/null; then + echo "wiki: rebuilding qmd index..." + qmd update 2>/dev/null + # WSL / Linux: no GPU, force CPU-only embeddings + if [[ "$(uname -s)" == "Linux" ]]; then + NODE_LLAMA_CPP_GPU=false qmd embed 2>/dev/null + else + qmd embed 2>/dev/null + fi + echo "wiki: qmd index updated" +fi +HOOK +chmod +x ~/projects/wiki/.git/hooks/post-merge +``` + +`.git/hooks/` isn't tracked by git, so this step runs on every machine +where you clone the repo. + +--- + +## 6. 
Backfill frontmatter (first-time setup or fresh clone) + +If you're starting with existing wiki pages that don't yet have +`last_verified` or `origin`, backfill them: + +```bash +cd ~/projects/wiki + +# Backfill last_verified from last_compiled/git/mtime +python3 scripts/wiki-hygiene.py --backfill + +# Backfill origin: manual on pre-automation pages (one-shot inline) +python3 -c " +import sys +sys.path.insert(0, 'scripts') +from wiki_lib import iter_live_pages, write_page +changed = 0 +for p in iter_live_pages(): + if 'origin' not in p.frontmatter: + p.frontmatter['origin'] = 'manual' + write_page(p) + changed += 1 +print(f'{changed} page(s) backfilled') +" +``` + +For a brand-new empty wiki, there's nothing to backfill — skip this step. + +--- + +## 7. Run the pipeline manually once + +Before setting up cron, do a full end-to-end dry run to make sure +everything's wired up: + +```bash +cd ~/projects/wiki + +# 1. Extract any existing Claude Code sessions +bash scripts/mine-conversations.sh --extract-only + +# 2. Summarize with claude -p (will make real LLM calls — can take minutes) +python3 scripts/summarize-conversations.py --claude + +# 3. Regenerate conversation index + wake-up context +python3 scripts/update-conversation-index.py --reindex + +# 4. Dry-run the maintenance pipeline +bash scripts/wiki-maintain.sh --dry-run --no-compile +``` + +Expected output from step 4: all three phases run, phase 3 (qmd reindex) +shows as skipped in dry-run mode, and you see `finished in Ns`. + +--- + +## 8. Cron setup (optional) + +If you want full automation, add these cron jobs. **Run them on only ONE +machine** — state files sync via git, so the other machine picks up the +results automatically. 
+ +```bash +crontab -e +``` + +```cron +# Wiki SSH key for cron (if your remote uses SSH with a key) +GIT_SSH_COMMAND="ssh -i /path/to/wiki-key -o StrictHostKeyChecking=no" + +# PATH for cron so claude, qmd, node, python3, pipx tools are findable +PATH=/home/YOUR_USER/.nvm/versions/node/v22/bin:/home/YOUR_USER/.local/bin:/usr/local/bin:/usr/bin:/bin + +# ─── Sync ────────────────────────────────────────────────────────────────── +# commit/pull/push every 15 minutes +*/15 * * * * /home/YOUR_USER/projects/wiki/scripts/wiki-sync.sh --commit >> /tmp/wiki-sync.log 2>&1 && /home/YOUR_USER/projects/wiki/scripts/wiki-sync.sh --pull >> /tmp/wiki-sync.log 2>&1 && /home/YOUR_USER/projects/wiki/scripts/wiki-sync.sh --push >> /tmp/wiki-sync.log 2>&1 + +# full sync with qmd reindex every 2 hours +0 */2 * * * /home/YOUR_USER/projects/wiki/scripts/wiki-sync.sh full >> /tmp/wiki-sync.log 2>&1 + +# ─── Mining ──────────────────────────────────────────────────────────────── +# Extract new sessions hourly (no LLM, fast) +0 * * * * /home/YOUR_USER/projects/wiki/scripts/mine-conversations.sh --extract-only >> /tmp/wiki-mine.log 2>&1 + +# Summarize + index daily at 2am (uses claude -p) +0 2 * * * cd /home/YOUR_USER/projects/wiki && python3 scripts/summarize-conversations.py --claude >> /tmp/wiki-mine.log 2>&1 && python3 scripts/update-conversation-index.py --reindex >> /tmp/wiki-mine.log 2>&1 + +# ─── Maintenance ─────────────────────────────────────────────────────────── +# Daily at 3am: harvest + quick hygiene + qmd reindex +0 3 * * * cd /home/YOUR_USER/projects/wiki && bash scripts/wiki-maintain.sh >> scripts/.maintain.log 2>&1 + +# Weekly Sunday at 4am: full hygiene with LLM checks +0 4 * * 0 cd /home/YOUR_USER/projects/wiki && bash scripts/wiki-maintain.sh --hygiene-only --full >> scripts/.maintain.log 2>&1 +``` + +Replace `YOUR_USER` and the node path as appropriate for your system. + +**macOS note**: `cron` needs Full Disk Access if you're pointing it at +files in `~/Documents` or `~/Desktop`.
Alternatively use `launchd` with +a plist — same effect, easier permission model on macOS. + +**WSL note**: make sure `cron` is actually running (`sudo service cron +start`). Cron doesn't auto-start in WSL by default. + +**`claude -p` in cron**: OAuth tokens must be cached before cron runs it. +Run `claude --version` once interactively as your user to prime the +token cache — cron then picks up the cached credentials. + +--- + +## 9. Tell Claude Code about the wiki + +Two separate CLAUDE.md files work together: + +1. **The wiki's own `CLAUDE.md`** at `~/projects/wiki/CLAUDE.md` — the + schema the agent reads when working INSIDE the wiki. Tells it how to + maintain pages, apply frontmatter, handle staging/archival. +2. **Your global `~/.claude/CLAUDE.md`** — the user-level instructions + the agent reads on EVERY session (regardless of directory). Tells it + when and how to consult the wiki from any other project. + +Both are provided as starter templates you can copy and adapt: + +### (a) Wiki schema — copy to the wiki root + +```bash +cp ~/projects/wiki/docs/examples/wiki-CLAUDE.md ~/projects/wiki/CLAUDE.md +# then edit ~/projects/wiki/CLAUDE.md for your own conventions +``` + +This file is ~200 lines. It defines: +- Directory structure and the automated-vs-manual core rule +- Frontmatter spec (required fields, staging fields, archive fields) +- Page-type conventions (pattern / decision / environment / concept) +- Operations: Ingest, Query, Mine, Harvest, Maintain, Lint +- **Search Strategy** — which of the three qmd collections to use for + which question type + +Customize the sections marked **"Customization Notes"** at the bottom +for your own categories, environments, and cross-reference format. 
+ +### (b) Global wake-up + query instructions + +Append the contents of `docs/examples/global-CLAUDE.md` to your global +Claude Code instructions: + +```bash +cat ~/projects/wiki/docs/examples/global-CLAUDE.md >> ~/.claude/CLAUDE.md +# then review ~/.claude/CLAUDE.md to integrate cleanly with any existing +# content +``` + +This adds: +- **Wake-Up Context** — read `context/wake-up.md` at session start +- **LLM Wiki — When to Consult It** — query mode vs ingest mode rules +- **LLM Wiki — How to Search It** — explicit guidance for all three qmd + collections (`wiki`, `wiki-archive`, `wiki-conversations`) with + example queries for each +- **Rules When Citing** — flag `confidence: low`, `status: pending`, + and archived pages to the user + +Together these give the agent a complete picture: how to maintain the +wiki when working inside it, and how to consult it from anywhere else. + +--- + +## 10. Verify + +```bash +cd ~/projects/wiki + +# Sync state +bash scripts/wiki-sync.sh --status + +# Search +qmd collection list +qmd search "test" --json -n 3 # won't return anything if wiki is empty + +# Mining +tail -20 scripts/.mine.log 2>/dev/null || echo "(no mining runs yet)" + +# End-to-end maintenance dry-run (no writes, no LLM, no network) +bash scripts/wiki-maintain.sh --dry-run --no-compile + +# Run the test suite +cd tests && python3 -m pytest +``` + +Expected: +- `qmd collection list` shows all three collections: `wiki`, `wiki-archive [excluded]`, `wiki-conversations [excluded]` +- `wiki-maintain.sh --dry-run` completes all three phases +- `pytest` passes all 171 tests in ~1.3 seconds + +--- + +## Troubleshooting + +**qmd search returns nothing** +```bash +qmd collection list # verify path points at the right place +qmd update # rebuild index +qmd embed # rebuild embeddings +cat ~/.config/qmd/index.yml # verify config is correct for your machine +``` + +**qmd collection points at the wrong path** +Edit `~/.config/qmd/index.yml` directly. 
Don't use `qmd collection add` +from inside the target directory — it can interpret the path oddly. + +**qmd returns archived pages in default searches** +Verify `wiki-archive` has `includeByDefault: false` in the YAML and +`qmd collection list` shows `[excluded]`. + +**`claude -p` fails in cron ("not authenticated")** +Cron has no browser. Run `claude --version` once as the same user +outside cron to cache OAuth tokens; cron will pick them up. Also verify +the `PATH` directive at the top of the crontab includes the directory +containing `claude`. + +**`wiki-harvest.py` fetch failures** +```bash +# Verify the extraction tools work +trafilatura -u "https://example.com" --markdown --no-comments --precision +crwl "https://example.com" -o markdown-fit + +# Check harvest state +python3 -c "import json; print(json.dumps(json.load(open('.harvest-state.json'))['failed_urls'], indent=2))" +``` + +**`wiki-hygiene.py` archived a page unexpectedly** +Check `last_verified` vs decay thresholds. If the page was never +referenced in a conversation, it decayed naturally. Restore with: +```bash +python3 scripts/wiki-hygiene.py --restore archive/patterns/foo.md +``` + +**Both machines ran maintenance simultaneously** +Merge conflicts on `.harvest-state.json` / `.hygiene-state.json` will +occur. Pick ONE machine for maintenance; disable the maintenance cron +on the other. Leave sync cron running on both so changes still propagate. + +**Tests fail** +Run `cd tests && python3 -m pytest -v` for verbose output. If the +failure mentions `WIKI_DIR` or module loading, verify +`scripts/wiki_lib.py` exists and contains the `WIKI_DIR` env var override +near the top. + +--- + +## Minimal install (skip everything except the idea) + +If you want the conceptual wiki without any of the automation, all you +actually need is: + +1. An empty directory +2. `CLAUDE.md` telling your agent the conventions (see the schema in + [`ARCHITECTURE.md`](ARCHITECTURE.md) or Karpathy's gist) +3. 
`index.md` for the agent to catalog pages +4. An agent that can read and write files (any Claude Code, Cursor, Aider + session works) + +Then tell the agent: "Start maintaining a wiki here. Every time I share +a source, integrate it. When I ask a question, check the wiki first." + +You can bolt on the automation layer later if/when it becomes worth +the setup effort. diff --git a/docs/examples/global-CLAUDE.md b/docs/examples/global-CLAUDE.md new file mode 100644 index 0000000..3c69b25 --- /dev/null +++ b/docs/examples/global-CLAUDE.md @@ -0,0 +1,161 @@ +# Global Claude Code Instructions — Wiki Section + +**What this is**: Content to add to your global `~/.claude/CLAUDE.md` +(the user-level instructions Claude Code reads at the start of every +session, regardless of which project you're in). These instructions tell +Claude how to consult the wiki from outside the wiki directory. + +**Where to paste it**: Append these sections to `~/.claude/CLAUDE.md`. +Don't overwrite the whole file — this is additive. + +--- + +Copy everything below this line into your global `~/.claude/CLAUDE.md`: + +--- + +## Wake-Up Context + +At the start of each session, read `~/projects/wiki/context/wake-up.md` +for a briefing on active projects, recent decisions, and current +concerns. This provides conversation continuity across sessions. + +## LLM Wiki — When to Consult It + +**Before creating API endpoints, Docker configs, CI pipelines, or making +architectural decisions**, check the wiki at `~/projects/wiki/` for +established patterns and decisions. + +The wiki captures the **why** behind patterns — not just what to do, but +the reasoning, constraints, alternatives rejected, and environment- +specific differences. It compounds over time as projects discover new +knowledge. 
+ +**When to read from the wiki** (query mode): +- Creating any operational endpoint (/health, /version, /status) +- Setting up secrets management in a new service +- Writing Dockerfiles or docker-compose configurations +- Configuring CI/CD pipelines +- Adding database users or migrations +- Making architectural decisions that should be consistent across projects + +**When to write back to the wiki** (ingest mode): +- When you discover something new that should apply across projects +- When a project reveals an exception or edge case to an existing pattern +- When a decision is made that future projects should follow +- When the human explicitly says "add this to the wiki" + +Human-initiated wiki writes go directly to the live wiki with +`origin: manual`. Script-initiated writes go through `staging/` first. +See the wiki's own `CLAUDE.md` for the full ingest protocol. + +## LLM Wiki — How to Search It + +Use the `qmd` CLI for fast, structured search. DO NOT read `index.md` +for large queries — it's only for full-catalog browsing. DO NOT grep the +wiki manually when `qmd` is available. + +The wiki has **three qmd collections**. Pick the right one for the +question: + +### Default collection: `wiki` (live content) + +For "what's our current pattern for X?" type questions. This is the +default — no `-c` flag needed. + +```bash +# Keyword search (fast, BM25) +qmd search "health endpoint version" --json -n 5 + +# Semantic search (finds conceptually related pages) +qmd vsearch "how should API endpoints be structured" --json -n 5 + +# Best quality — hybrid BM25 + vector + LLM re-ranking +qmd query "health endpoint" --json -n 5 + +# Then read the matched page +cat ~/projects/wiki/patterns/health-endpoints.md +``` + +### Archive collection: `wiki-archive` (stale / superseded) + +For "what was our OLD pattern before we changed it?" questions. This is +excluded from default searches; query explicitly with `-c wiki-archive`. + +```bash +# "Did we used to use Alpine? 
Why did we stop?" +qmd search "alpine" -c wiki-archive --json -n 5 + +# Semantic search across archive +qmd vsearch "container base image considerations" -c wiki-archive --json -n 5 +``` + +When you cite content from an archived page, tell the user it's +archived and may be outdated. + +### Conversations collection: `wiki-conversations` (mined session transcripts) + +For "when did we discuss this, and what did we decide?" questions. This +is the mined history of your actual Claude Code sessions — decisions, +debugging breakthroughs, design discussions. Excluded from default +searches because transcripts would flood results. + +```bash +# "When did we decide to use staging?" +qmd search "staging review workflow" -c wiki-conversations --json -n 5 + +# "What debugging did we do around Docker networking?" +qmd vsearch "docker network conflicts" -c wiki-conversations --json -n 5 +``` + +Useful for: +- Tracing the reasoning behind a decision back to the session where it + was made +- Finding a solution to a problem you remember solving but didn't write + up +- Context-gathering when returning to a project after time away + +### Searching across all collections + +Rarely needed, but for "find everything on this topic across time": + +```bash +qmd search "topic" -c wiki -c wiki-archive -c wiki-conversations --json -n 10 +``` + +## LLM Wiki — Rules When Citing + +1. **Always use `--json`** for structured qmd output. Never try to parse + prose. +2. **Flag `confidence: low` pages** to the user when citing. The content + may be aging out. +3. **Flag `status: pending` pages** (in `staging/`) as unverified when + citing: "Note: this is from a pending wiki page that has not been + human-reviewed yet." +4. **Flag archived pages** as "archived and may be outdated" when citing. +5. **Use `index.md` for browsing only**, not for targeted lookups. `qmd` + is faster and more accurate. +6. **Prefer semantic search for conceptual queries**, keyword search for + specific names/terms. 
+ +## LLM Wiki — Quick Reference + +- `~/projects/wiki/CLAUDE.md` — Full wiki schema and operations (read this when working IN the wiki) +- `~/projects/wiki/index.md` — Content catalog (browse the full wiki) +- `~/projects/wiki/patterns/` — How things should be built +- `~/projects/wiki/decisions/` — Why we chose this approach +- `~/projects/wiki/environments/` — Where environments differ +- `~/projects/wiki/concepts/` — Foundational ideas +- `~/projects/wiki/raw/` — Immutable source material (never modify) +- `~/projects/wiki/staging/` — Pending automated content (flag when citing) +- `~/projects/wiki/archive/` — Stale content (flag when citing) +- `~/projects/wiki/conversations/` — Session history (search via `-c wiki-conversations`) + +--- + +**End of additions for `~/.claude/CLAUDE.md`.** + +See also the wiki's own `CLAUDE.md` at the wiki root — that file tells +the agent how to *maintain* the wiki when working inside it. This file +(the global one) tells the agent how to *consult* the wiki from anywhere +else. diff --git a/docs/examples/wiki-CLAUDE.md b/docs/examples/wiki-CLAUDE.md new file mode 100644 index 0000000..654b189 --- /dev/null +++ b/docs/examples/wiki-CLAUDE.md @@ -0,0 +1,278 @@ +# LLM Wiki — Schema + +This is a persistent, compounding knowledge base maintained by LLM agents. +It captures the **why** behind patterns, decisions, and implementations — +not just the what. Copy this file to the root of your wiki directory +(i.e. `~/projects/wiki/CLAUDE.md`) and edit for your own conventions. + +> This is an example `CLAUDE.md` for the wiki root. The agent reads this +> at the start of every session when working inside the wiki. It's the +> "constitution" that tells the agent how to maintain the knowledge base. + +## How This Wiki Works + +**You are the maintainer.** When working in this wiki directory, you read +raw sources, compile knowledge into wiki pages, maintain cross-references, +and keep everything consistent. 
+ +**You are a consumer.** When working in any other project directory, you +read wiki pages to inform your work — applying established patterns, +respecting decisions, and understanding context. + +## Directory Structure + +``` +wiki/ +├── CLAUDE.md ← You are here (schema) +├── index.md ← Content catalog — read this FIRST on any query +├── log.md ← Chronological record of all operations +│ +├── patterns/ ← LIVE: HOW things should be built (with WHY) +├── decisions/ ← LIVE: WHY we chose this approach (with alternatives rejected) +├── environments/ ← LIVE: WHERE implementations differ +├── concepts/ ← LIVE: WHAT the foundational ideas are +│ +├── raw/ ← Immutable source material (NEVER modify) +│ └── harvested/ ← URL harvester output +│ +├── staging/ ← PENDING automated content awaiting human review +│ ├── index.md +│ └── <type>/ +│ +├── archive/ ← STALE / superseded (excluded from default search) +│ ├── index.md +│ └── <type>/ +│ +├── conversations/ ← Mined Claude Code session transcripts +│ ├── index.md +│ └── <wing>/ ← per-project or per-person (MemPalace "wing") +│ +├── context/ ← Auto-updated AI session briefing +│ ├── wake-up.md ← Loaded at the start of every session +│ └── active-concerns.md +│ +├── reports/ ← Hygiene operation logs +└── scripts/ ← The automation pipeline +``` + +**Core rule — automated vs manual content**: + +| Origin | Destination | Status | +|--------|-------------|--------| +| Script-generated (harvester, hygiene, URL compile) | `staging/` | `pending` | +| Human-initiated ("add this to the wiki" in a Claude session) | Live wiki (`patterns/`, etc.) | `verified` | +| Human-reviewed from staging | Live wiki (promoted) | `verified` | + +Managed via `scripts/wiki-staging.py --list / --promote / --reject / --review`.
+ +## Page Conventions + +### Frontmatter (required on all wiki pages) + +```yaml +--- +title: Page Title +type: pattern | decision | environment | concept +confidence: high | medium | low +origin: manual | automated # How the page entered the wiki +sources: [list of raw/ files this was compiled from] +related: [list of other wiki pages this connects to] +last_compiled: YYYY-MM-DD # Date this page was last (re)compiled from sources +last_verified: YYYY-MM-DD # Date the content was last confirmed accurate +--- +``` + +**`origin` values**: +- `manual` — Created by a human in a Claude session. Goes directly to the live wiki, no staging. +- `automated` — Created by a script (harvester, hygiene, etc.). Must pass through `staging/` for human review before promotion. + +**Confidence decay**: Pages with no refresh signal for 6 months decay `high → medium`; 9 months → `low`; 12 months → `stale` (auto-archived). `last_verified` drives decay, not `last_compiled`. See `scripts/wiki-hygiene.py` and `archive/index.md`. + +### Staging Frontmatter (pages in `staging/<type>/`) + +Automated-origin pages get additional staging metadata that is **stripped on promotion**: + +```yaml +--- +title: ... +type: ... +origin: automated +status: pending # Awaiting review +staged_date: YYYY-MM-DD # When the automated script staged this +staged_by: wiki-harvest # Which script staged it (wiki-harvest, wiki-hygiene, ...) +target_path: patterns/foo.md # Where it should land on promotion +modifies: patterns/bar.md # Only present when this is an update to an existing live page +compilation_notes: "..." # AI's explanation of what it did and why +harvest_source: https://... # Only present for URL-harvested content +sources: [...] +related: [...] +last_verified: YYYY-MM-DD +--- +``` + +### Pattern Pages (`patterns/`) + +Structure: +1. **What** — One-paragraph description of the pattern +2. **Why** — The reasoning, constraints, and goals that led to this pattern +3. 
**Canonical Example** — A concrete implementation (link to raw/ source or inline) +4. **Structure** — The specification: fields, endpoints, formats, conventions +5. **When to Deviate** — Known exceptions or conditions where the pattern doesn't apply +6. **History** — Key changes and the decisions that drove them + +### Decision Pages (`decisions/`) + +Structure: +1. **Decision** — One sentence: what we decided +2. **Context** — What problem or constraint prompted this +3. **Options Considered** — What alternatives existed (with pros/cons) +4. **Rationale** — Why this option won +5. **Consequences** — What this decision enables and constrains +6. **Status** — Active | Superseded by [link] | Under Review + +### Environment Pages (`environments/`) + +Structure: +1. **Overview** — What this environment is (platform, CI, infra) +2. **Key Differences** — Table comparing environments for this domain +3. **Implementation Details** — Environment-specific configs, credentials, deploy method +4. **Gotchas** — Things that have bitten us + +### Concept Pages (`concepts/`) + +Structure: +1. **Definition** — What this concept means in our context +2. **Why It Matters** — How this concept shapes our decisions +3. **Related Patterns** — Links to patterns that implement this concept +4. **Related Decisions** — Links to decisions driven by this concept + +## Operations + +### Ingest (adding new knowledge) + +When a new raw source is added or you learn something new: + +1. Read the source material thoroughly +2. Identify which existing wiki pages need updating +3. Identify if new pages are needed +4. Update/create pages following the conventions above +5. Update cross-references (`related:` frontmatter) on all affected pages +6. Update `index.md` with any new pages +7. Set `last_verified:` to today's date on every page you create or update +8. Set `origin: manual` on any page you create when a human directed you to +9. 
Append to `log.md`: `## [YYYY-MM-DD] ingest | Source Description` + +**Where to write**: +- **Human-initiated** ("add this to the wiki", "create a pattern for X") — write directly to the live directory (`patterns/`, `decisions/`, etc.) with `origin: manual`. The human's instruction IS the approval. +- **Script-initiated** (harvest, auto-compile, hygiene auto-fix) — write to `staging/<type>/` with `origin: automated`, `status: pending`, plus `staged_date`, `staged_by`, `target_path`, and `compilation_notes`. For updates to existing live pages, also set `modifies: <path-of-live-page>`. + +### Query (answering questions from other projects) + +When working in another project and consulting the wiki: + +1. Use `qmd` to search first (see Search Strategy below). Read `index.md` only when browsing the full catalog. +2. Read the specific pattern/decision/concept pages +3. Apply the knowledge, respecting environment differences +4. If a page's `confidence` is `low`, flag that to the user — the content may be aging out +5. If a page has `status: pending` (it's in `staging/`), flag that to the user: "Note: this is from a pending wiki page in staging, not yet verified." Use the content but make the uncertainty visible. +6. If you find yourself consulting a page under `archive/`, mention it's archived and may be outdated +7. If your work reveals new knowledge, **file it back** — update the wiki (and bump `last_verified`) + +### Search Strategy — which qmd collection to use + +The wiki has three qmd collections. Pick the right one for the question: + +| Question type | Collection | Command | +|---|---|---| +| "What's our current pattern for X?" | `wiki` (default) | `qmd search "X" --json -n 5` | +| "What's the rationale behind decision Y?" | `wiki` (default) | `qmd vsearch "why did we choose Y" --json -n 5` | +| "What was our OLD approach before we changed it?" | `wiki-archive` | `qmd search "X" -c wiki-archive --json -n 5` | +| "When did we discuss this, and what did we decide?" 
| `wiki-conversations` | `qmd search "X" -c wiki-conversations --json -n 5` | +| "Find everything across time" | all three | `qmd search "X" -c wiki -c wiki-archive -c wiki-conversations --json -n 10` | + +**Rules of thumb**: +- Use `qmd search` for keyword matches (BM25, fast) +- Use `qmd vsearch` for conceptual / semantically-similar queries (vector) +- Use `qmd query` for the best quality — hybrid BM25 + vector + LLM re-ranking +- Always use `--json` for structured output +- Read individual matched pages with `cat` or your file tool after finding them + +### Mine (conversation extraction and summarization) + +Four-phase pipeline that extracts sessions into searchable conversation pages: + +1. **Extract** (`extract-sessions.py`) — Parse session files into markdown transcripts +2. **Summarize** (`summarize-conversations.py --claude`) — Classify + summarize via `claude -p` with haiku/sonnet routing +3. **Index** (`update-conversation-index.py --reindex`) — Regenerate conversation index + `context/wake-up.md` +4. **Harvest** (`wiki-harvest.py`) — Scan summarized conversations for external reference URLs and compile them into wiki pages + +`mine-conversations.sh` runs phases 1–3 (extract → summarize → index); phase 4 (harvest) runs from `scripts/wiki-maintain.sh` (see Maintain below). Extraction is incremental (tracks byte offsets). Summarization is incremental (tracks message count). + +### Maintain (wiki health automation) + +`scripts/wiki-maintain.sh` chains harvest + hygiene + qmd reindex: + +```bash +bash scripts/wiki-maintain.sh # Harvest + quick hygiene + reindex +bash scripts/wiki-maintain.sh --full # Harvest + full hygiene (LLM) + reindex +bash scripts/wiki-maintain.sh --harvest-only # Harvest only +bash scripts/wiki-maintain.sh --hygiene-only # Hygiene only +bash scripts/wiki-maintain.sh --dry-run # Show what would run +``` + +### Lint (periodic health check) + +Automated via `scripts/wiki-hygiene.py`. 
Two tiers: + +**Quick mode** (no LLM, run daily — `python3 scripts/wiki-hygiene.py`): +- Backfill missing `last_verified` +- Refresh `last_verified` from conversation `related:` references +- Auto-restore archived pages that are referenced again +- Repair frontmatter (missing required fields, invalid values) +- Confidence decay per 6/9/12-month thresholds +- Archive stale and superseded pages +- Orphan pages (auto-linked into `index.md`) +- Broken cross-references (fuzzy-match fix via `difflib`, or restore from archive) +- Main index drift (auto add missing entries, remove stale ones) +- Empty stubs (report-only) +- State file drift (report-only) +- Staging/archive index resync + +**Full mode** (LLM, run weekly — `python3 scripts/wiki-hygiene.py --full`): +- Everything in quick mode, plus: +- Missing cross-references between related pages (haiku) +- Duplicate coverage — weaker page auto-merged into stronger (sonnet) +- Contradictions between pages (sonnet, report-only) +- Technology lifecycle — flag pages referencing versions older than what's in recent conversations + +**Reports** (written to `reports/`): +- `hygiene-YYYY-MM-DD-fixed.md` — what was auto-fixed +- `hygiene-YYYY-MM-DD-needs-review.md` — what needs human judgment + +## Cross-Reference Conventions + +- Link between wiki pages using relative markdown links: `[Pattern Name](../patterns/file.md)` +- Link to raw sources: `[Source](../raw/path/to/file.md)` +- In frontmatter `related:` use the relative filename: `patterns/secrets-at-startup.md` + +## Naming Conventions + +- Filenames: `kebab-case.md` +- Patterns: named by what they standardize (e.g., `health-endpoints.md`, `secrets-at-startup.md`) +- Decisions: named by what was decided (e.g., `no-alpine.md`, `dhi-base-images.md`) +- Environments: named by domain (e.g., `docker-registries.md`, `ci-cd-platforms.md`) +- Concepts: named by the concept (e.g., `two-user-database-model.md`, `build-once-deploy-many.md`) + +## Customization Notes + +Things you 
should change for your own wiki: + +1. **Directory structure** — the four live dirs (`patterns/`, `decisions/`, `concepts/`, `environments/`) reflect engineering use cases. Pick categories that match how you think — research wikis might use `findings/`, `hypotheses/`, `methods/`, `literature/` instead. Update `LIVE_CONTENT_DIRS` in `scripts/wiki_lib.py` to match. + +2. **Page-type sections** — the "Structure" blocks under each page type are for my use. Define your own conventions. + +3. **`status` field** — if you want to track Superseded/Active/Under Review explicitly, this is a natural add. The hygiene script already checks for `status: Superseded by ...` and archives those automatically. + +4. **Environment Detection** — if you don't have multiple environments, remove the section. If you do, update it for your own environments (work/home, dev/prod, mac/linux, etc.). + +5. **Cross-reference path format** — I use `patterns/foo.md` in the `related:` field. Obsidian users might prefer `[[foo]]` wikilink format. The hygiene script handles standard markdown links; adapt as needed. diff --git a/scripts/extract-sessions.py b/scripts/extract-sessions.py new file mode 100755 index 0000000..a3023da --- /dev/null +++ b/scripts/extract-sessions.py @@ -0,0 +1,810 @@ +#!/usr/bin/env python3 +"""Extract Claude Code session JSONL files into clean markdown transcripts. + +Phase A of the conversation mining pipeline. Deterministic, no LLM dependency. +Handles incremental extraction via byte offset tracking for sessions that span +hours or days. 
+ +Usage: + python3 extract-sessions.py # Extract all new sessions + python3 extract-sessions.py --project mc # Extract one project + python3 extract-sessions.py --session 0a543572 # Extract specific session + python3 extract-sessions.py --dry-run # Show what would be extracted +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +CLAUDE_PROJECTS_DIR = Path(os.environ.get("CLAUDE_PROJECTS_DIR", str(Path.home() / ".claude" / "projects"))) +WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki"))) +CONVERSATIONS_DIR = WIKI_DIR / "conversations" +MINE_STATE_FILE = WIKI_DIR / ".mine-state.json" + +# ════════════════════════════════════════════════════════════════════════════ +# CONFIGURE ME — Map Claude project directory suffixes to wiki project codes +# ════════════════════════════════════════════════════════════════════════════ +# +# Claude Code stores sessions under ~/.claude/projects/<encoded-project-path>/. The +# directory name is derived from the absolute path of your project, so it +# looks like `-Users-alice-projects-myapp` or `-home-alice-projects-myapp`. +# +# This map tells the extractor which suffix maps to which short wiki code +# (the "wing"). More specific suffixes should appear first — the extractor +# picks the first match (tested with `str.endswith`). A directory that matches +# no suffix is skipped; it is NOT filed under `general/`. 
+# +# Examples — replace with your own projects: +PROJECT_MAP: dict[str, str] = { + # More specific suffixes first + "projects-wiki": "wiki", # this wiki itself + "-claude": "cl", # ~/.claude config repo + # Add your real projects here: + # "my-webapp": "web", + # "my-mobile-app": "mob", + # "work-mono-repo": "work", + # Catch-all — Claude sessions outside any tracked project + "-home": "general", + "-Users": "general", +} + +# Tool call names to keep full output for +KEEP_FULL_OUTPUT_TOOLS = {"Bash", "Skill"} + +# Tool call names to summarize (just note what was accessed) +SUMMARIZE_TOOLS = {"Read", "Glob", "Grep"} + +# Tool call names to keep with path + change summary +KEEP_CHANGE_TOOLS = {"Edit", "Write"} + +# Tool call names to keep description + result summary +KEEP_SUMMARY_TOOLS = {"Agent"} + +# Max lines of Bash output to keep +MAX_BASH_OUTPUT_LINES = 200 + +# --------------------------------------------------------------------------- +# State management +# --------------------------------------------------------------------------- + + +def load_state() -> dict[str, Any]: + """Load mining state from .mine-state.json.""" + if MINE_STATE_FILE.exists(): + with open(MINE_STATE_FILE) as f: + return json.load(f) + return {"sessions": {}, "last_run": None} + + +def save_state(state: dict[str, Any]) -> None: + """Save mining state to .mine-state.json.""" + state["last_run"] = datetime.now(timezone.utc).isoformat() + with open(MINE_STATE_FILE, "w") as f: + json.dump(state, f, indent=2) + + +# --------------------------------------------------------------------------- +# Project mapping +# --------------------------------------------------------------------------- + + +def resolve_project_code(dir_name: str) -> str | None: + """Map a Claude project directory name to a wiki project code. 
+ + Directory names look like: -Users-alice-projects-myapp or -home-alice-projects-myapp + """ + for suffix, code in PROJECT_MAP.items(): + if dir_name.endswith(suffix): + return code + return None + + +def discover_sessions( + project_filter: str | None = None, + session_filter: str | None = None, +) -> list[dict[str, Any]]: + """Discover JSONL session files from Claude projects directory.""" + sessions = [] + + if not CLAUDE_PROJECTS_DIR.exists(): + print(f"Claude projects directory not found: {CLAUDE_PROJECTS_DIR}", file=sys.stderr) + return sessions + + for proj_dir in sorted(CLAUDE_PROJECTS_DIR.iterdir()): + if not proj_dir.is_dir(): + continue + + code = resolve_project_code(proj_dir.name) + if code is None: + continue + + if project_filter and code != project_filter: + continue + + for jsonl_file in sorted(proj_dir.glob("*.jsonl")): + session_id = jsonl_file.stem + if session_filter and not session_id.startswith(session_filter): + continue + + sessions.append({ + "session_id": session_id, + "project": code, + "jsonl_path": jsonl_file, + "file_size": jsonl_file.stat().st_size, + }) + + return sessions + + +# --------------------------------------------------------------------------- +# JSONL parsing and filtering +# --------------------------------------------------------------------------- + + +def extract_timestamp(obj: dict[str, Any]) -> str | None: + """Get timestamp from a JSONL record.""" + ts = obj.get("timestamp") + if isinstance(ts, str): + return ts + if isinstance(ts, (int, float)): + return datetime.fromtimestamp(ts / 1000, tz=timezone.utc).isoformat() + return None + + +def extract_session_date(obj: dict[str, Any]) -> str: + """Get date string (YYYY-MM-DD) from a JSONL record timestamp.""" + ts = extract_timestamp(obj) + if ts: + try: + dt = datetime.fromisoformat(ts.replace("Z", "+00:00")) + return dt.strftime("%Y-%m-%d") + except (ValueError, TypeError): + pass + return datetime.now(timezone.utc).strftime("%Y-%m-%d") + + +def 
truncate_lines(text: str, max_lines: int) -> str: + """Truncate text to max_lines, adding a note if truncated.""" + lines = text.splitlines() + if len(lines) <= max_lines: + return text + kept = lines[:max_lines] + omitted = len(lines) - max_lines + kept.append(f"\n[... {omitted} lines truncated ...]") + return "\n".join(kept) + + +def format_tool_use(name: str, input_data: dict[str, Any]) -> str | None: + """Format a tool_use content block for the transcript.""" + if name in KEEP_FULL_OUTPUT_TOOLS: + if name == "Bash": + cmd = input_data.get("command", "") + desc = input_data.get("description", "") + label = desc if desc else cmd[:100] + return f"**[Bash]**: `{label}`" + if name == "Skill": + skill = input_data.get("skill", "") + args = input_data.get("args", "") + return f"**[Skill]**: /{skill} {args}".strip() + + if name in SUMMARIZE_TOOLS: + if name == "Read": + fp = input_data.get("file_path", "?") + return f"[Read: {fp}]" + if name == "Glob": + pattern = input_data.get("pattern", "?") + return f"[Glob: {pattern}]" + if name == "Grep": + pattern = input_data.get("pattern", "?") + path = input_data.get("path", "") + return f"[Grep: '{pattern}' in {path}]" if path else f"[Grep: '{pattern}']" + + if name in KEEP_CHANGE_TOOLS: + if name == "Edit": + fp = input_data.get("file_path", "?") + old = input_data.get("old_string", "")[:60] + return f"**[Edit]**: {fp} — replaced '{old}...'" + if name == "Write": + fp = input_data.get("file_path", "?") + content_len = len(input_data.get("content", "")) + return f"**[Write]**: {fp} ({content_len} chars)" + + if name in KEEP_SUMMARY_TOOLS: + if name == "Agent": + desc = input_data.get("description", "?") + return f"**[Agent]**: {desc}" + + if name == "ToolSearch": + return None # noise + if name == "TaskCreate": + subj = input_data.get("subject", "?") + return f"[TaskCreate: {subj}]" + if name == "TaskUpdate": + tid = input_data.get("taskId", "?") + status = input_data.get("status", "?") + return f"[TaskUpdate: #{tid} → 
{status}]" + + # Default: note the tool was called + return f"[{name}]" + + +def format_tool_result( + tool_name: str | None, + content: Any, + is_error: bool = False, +) -> str | None: + """Format a tool_result content block for the transcript.""" + text = "" + if isinstance(content, str): + text = content + elif isinstance(content, list): + parts = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + parts.append(item.get("text", "")) + text = "\n".join(parts) + + if not text.strip(): + return None + + if is_error: + return f"**[ERROR]**:\n```\n{truncate_lines(text, MAX_BASH_OUTPUT_LINES)}\n```" + + if tool_name in KEEP_FULL_OUTPUT_TOOLS: + return f"```\n{truncate_lines(text, MAX_BASH_OUTPUT_LINES)}\n```" + + if tool_name in SUMMARIZE_TOOLS: + # Just note the result size + line_count = len(text.splitlines()) + char_count = len(text) + return f"[→ {line_count} lines, {char_count} chars]" + + if tool_name in KEEP_CHANGE_TOOLS: + return None # The tool_use already captured what changed + + if tool_name in KEEP_SUMMARY_TOOLS: + # Keep a summary of agent results + summary = text[:300] + if len(text) > 300: + summary += "..." + return f"> {summary}" + + return None + + +def parse_content_blocks( + content: list[dict[str, Any]], + role: str, + tool_id_to_name: dict[str, str], +) -> list[str]: + """Parse content blocks from a message into transcript lines.""" + parts: list[str] = [] + + for block in content: + block_type = block.get("type") + + if block_type == "text": + text = block.get("text", "").strip() + if not text: + continue + # Skip system-reminder content + if "" in text: + # Strip system reminder tags and their content + text = re.sub( + r".*?", + "", + text, + flags=re.DOTALL, + ).strip() + # Skip local-command noise + if text.startswith(" tuple[list[str], dict[str, Any]]: + """Process a JSONL session file and return transcript lines + metadata. 
+ + Args: + jsonl_path: Path to the JSONL file + byte_offset: Start reading from this byte position (for incremental) + + Returns: + Tuple of (transcript_lines, metadata_dict) + """ + transcript_lines: list[str] = [] + metadata: dict[str, Any] = { + "first_date": None, + "last_date": None, + "message_count": 0, + "human_messages": 0, + "assistant_messages": 0, + "git_branch": None, + "new_byte_offset": 0, + } + + # Map tool_use IDs to tool names for correlating results + tool_id_to_name: dict[str, str] = {} + + # Track when a command/skill was just invoked so the next user message + # (the skill prompt injection) gets labeled correctly + last_command_name: str | None = None + + with open(jsonl_path, "rb") as f: + if byte_offset > 0: + f.seek(byte_offset) + + for raw_line in f: + try: + obj = json.loads(raw_line) + except json.JSONDecodeError: + continue + + record_type = obj.get("type") + + # Skip non-message types + if record_type not in ("user", "assistant"): + continue + + msg = obj.get("message", {}) + role = msg.get("role", record_type) + content = msg.get("content", "") + + # Track metadata + date = extract_session_date(obj) + if metadata["first_date"] is None: + metadata["first_date"] = date + metadata["last_date"] = date + metadata["message_count"] += 1 + + if not metadata["git_branch"]: + metadata["git_branch"] = obj.get("gitBranch") + + if role == "user": + metadata["human_messages"] += 1 + elif role == "assistant": + metadata["assistant_messages"] += 1 + + # Process content + if isinstance(content, str): + text = content.strip() + # Skip system-reminder and local-command noise + if "" in text: + text = re.sub( + r".*?", + "", + text, + flags=re.DOTALL, + ).strip() + if text.startswith("/exit"): + continue + + # Detect command/skill invocation: /foo + cmd_match = re.search( + r"/([^<]+)", text, + ) + if cmd_match: + last_command_name = cmd_match.group(1) + # Keep just a brief note about the command invocation + transcript_lines.append( + f"**Human**: 
/{last_command_name}" + ) + transcript_lines.append("") + continue + + # Detect skill prompt injection (large structured text after a command) + if ( + last_command_name + and role == "user" + and len(text) > 500 + ): + # This is the skill's injected prompt — summarize it + transcript_lines.append( + f"[Skill prompt: /{last_command_name} — {len(text)} chars]" + ) + transcript_lines.append("") + last_command_name = None + continue + + # Also detect skill prompts by content pattern (catches cases + # where the command-name message wasn't separate, or where the + # prompt arrives without a preceding command-name tag) + if ( + role == "user" + and len(text) > 500 + and re.match( + r"^##\s*(Tracking|Step|Context|Instructions|Overview|Goal)", + text, + ) + ): + # Structured skill prompt — try to extract command name + cmd_in_text = re.search( + r'--command\s+"([^"]+)"', text, + ) + prompt_label = cmd_in_text.group(1) if cmd_in_text else (last_command_name or "unknown") + transcript_lines.append( + f"[Skill prompt: /{prompt_label} — {len(text)} chars]" + ) + transcript_lines.append("") + last_command_name = None + continue + + last_command_name = None # Reset after non-matching message + + if text: + label = "**Human**" if role == "user" else "**Assistant**" + transcript_lines.append(f"{label}: {text}") + transcript_lines.append("") + + elif isinstance(content, list): + # Check if this is a skill prompt in list form + is_skill_prompt = False + skill_prompt_name = last_command_name + if role == "user": + for block in content: + if block.get("type") == "text": + block_text = block.get("text", "").strip() + # Detect by preceding command name + if last_command_name and len(block_text) > 500: + is_skill_prompt = True + break + # Detect by content pattern (## Tracking, etc.) 
+ if ( + len(block_text) > 500 + and re.match( + r"^##\s*(Tracking|Step|Context|Instructions|Overview|Goal)", + block_text, + ) + ): + is_skill_prompt = True + # Try to extract command name from content + cmd_in_text = re.search( + r'--command\s+"([^"]+)"', block_text, + ) + if cmd_in_text: + skill_prompt_name = cmd_in_text.group(1) + break + + if is_skill_prompt: + total_len = sum( + len(b.get("text", "")) + for b in content + if b.get("type") == "text" + ) + label = skill_prompt_name or "unknown" + transcript_lines.append( + f"[Skill prompt: /{label} — {total_len} chars]" + ) + transcript_lines.append("") + last_command_name = None + continue + + last_command_name = None + + parts = parse_content_blocks(content, role, tool_id_to_name) + if parts: + # Determine if this is a tool result message (user role but + # contains only tool_result blocks — these are tool outputs, + # not human input) + has_only_tool_results = all( + b.get("type") in ("tool_result",) + for b in content + if b.get("type") != "text" or b.get("text", "").strip() + ) and any(b.get("type") == "tool_result" for b in content) + + if has_only_tool_results: + # Tool results — no speaker label, just the formatted output + for part in parts: + transcript_lines.append(part) + elif role == "user": + # Check if there's actual human text (not just tool results) + has_human_text = any( + b.get("type") == "text" + and b.get("text", "").strip() + and "" not in b.get("text", "") + for b in content + ) + label = "**Human**" if has_human_text else "**Assistant**" + if len(parts) == 1: + transcript_lines.append(f"{label}: {parts[0]}") + else: + transcript_lines.append(f"{label}:") + for part in parts: + transcript_lines.append(part) + else: + label = "**Assistant**" + if len(parts) == 1: + transcript_lines.append(f"{label}: {parts[0]}") + else: + transcript_lines.append(f"{label}:") + for part in parts: + transcript_lines.append(part) + transcript_lines.append("") + + metadata["new_byte_offset"] = f.tell() + + 
return transcript_lines, metadata + + +# --------------------------------------------------------------------------- +# Markdown generation +# --------------------------------------------------------------------------- + + +def build_frontmatter( + session_id: str, + project: str, + date: str, + message_count: int, + git_branch: str | None = None, +) -> str: + """Build YAML frontmatter for a conversation markdown file.""" + lines = [ + "---", + f"title: Session {session_id[:8]}", + "type: conversation", + f"project: {project}", + f"date: {date}", + f"session_id: {session_id}", + f"messages: {message_count}", + "status: extracted", + ] + if git_branch: + lines.append(f"git_branch: {git_branch}") + lines.append("---") + return "\n".join(lines) + + +def write_new_conversation( + output_path: Path, + session_id: str, + project: str, + transcript_lines: list[str], + metadata: dict[str, Any], +) -> None: + """Write a new conversation markdown file.""" + date = metadata["first_date"] or datetime.now(timezone.utc).strftime("%Y-%m-%d") + frontmatter = build_frontmatter( + session_id=session_id, + project=project, + date=date, + message_count=metadata["message_count"], + git_branch=metadata.get("git_branch"), + ) + + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + f.write(frontmatter) + f.write("\n\n## Transcript\n\n") + f.write("\n".join(transcript_lines)) + f.write("\n") + + +def append_to_conversation( + output_path: Path, + transcript_lines: list[str], + new_message_count: int, +) -> None: + """Append new transcript content to an existing conversation file. + + Updates the message count in frontmatter and appends new transcript lines. 
+ """ + content = output_path.read_text() + + # Update message count in frontmatter + content = re.sub( + r"^messages: \d+$", + f"messages: {new_message_count}", + content, + count=1, + flags=re.MULTILINE, + ) + + # Add last_updated + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + if "last_updated:" in content: + content = re.sub( + r"^last_updated: .+$", + f"last_updated: {today}", + content, + count=1, + flags=re.MULTILINE, + ) + else: + content = content.replace( + "\nstatus: extracted", + f"\nlast_updated: {today}\nstatus: extracted", + ) + + # Append new transcript + with open(output_path, "w") as f: + f.write(content) + if not content.endswith("\n"): + f.write("\n") + f.write("\n".join(transcript_lines)) + f.write("\n") + + +# --------------------------------------------------------------------------- +# Main extraction logic +# --------------------------------------------------------------------------- + + +def extract_session( + session_info: dict[str, Any], + state: dict[str, Any], + dry_run: bool = False, +) -> bool: + """Extract a single session. 
Returns True if work was done.""" + session_id = session_info["session_id"] + project = session_info["project"] + jsonl_path = session_info["jsonl_path"] + file_size = session_info["file_size"] + + # Check state for prior extraction + session_state = state["sessions"].get(session_id, {}) + last_offset = session_state.get("byte_offset", 0) + + # Skip if no new content + if file_size <= last_offset: + return False + + is_incremental = last_offset > 0 + + if dry_run: + mode = "append" if is_incremental else "new" + new_bytes = file_size - last_offset + print(f" [{mode}] {project}/{session_id[:8]} — {new_bytes:,} new bytes") + return True + + # Parse the JSONL + transcript_lines, metadata = process_jsonl(jsonl_path, byte_offset=last_offset) + + if not transcript_lines: + # Update offset even if no extractable content + state["sessions"][session_id] = { + "project": project, + "byte_offset": metadata["new_byte_offset"], + "message_count": session_state.get("message_count", 0), + "last_extracted": datetime.now(timezone.utc).isoformat(), + "summarized_through_msg": session_state.get("summarized_through_msg", 0), + } + return False + + # Determine output path + date = metadata["first_date"] or datetime.now(timezone.utc).strftime("%Y-%m-%d") + if is_incremental: + # Use existing output file + output_file = session_state.get("output_file", "") + output_path = WIKI_DIR / output_file if output_file else None + else: + output_path = None + + if output_path is None or not output_path.exists(): + filename = f"{date}-{session_id[:8]}.md" + output_path = CONVERSATIONS_DIR / project / filename + + # Write or append + total_messages = session_state.get("message_count", 0) + metadata["message_count"] + + if is_incremental and output_path.exists(): + append_to_conversation(output_path, transcript_lines, total_messages) + print(f" [append] {project}/{output_path.name} — +{metadata['message_count']} messages") + else: + write_new_conversation(output_path, session_id, project, 
transcript_lines, metadata) + print(f" [new] {project}/{output_path.name} — {metadata['message_count']} messages") + + # Update state + state["sessions"][session_id] = { + "project": project, + "output_file": str(output_path.relative_to(WIKI_DIR)), + "byte_offset": metadata["new_byte_offset"], + "message_count": total_messages, + "last_extracted": datetime.now(timezone.utc).isoformat(), + "summarized_through_msg": session_state.get("summarized_through_msg", 0), + } + + return True + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Extract Claude Code sessions into markdown transcripts", + ) + parser.add_argument( + "--project", + help="Only extract sessions for this project code (e.g., mc, if, lp)", + ) + parser.add_argument( + "--session", + help="Only extract this specific session (prefix match on session ID)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be extracted without writing files", + ) + parser.add_argument( + "--force", + action="store_true", + help="Re-extract from the beginning, ignoring saved byte offsets", + ) + args = parser.parse_args() + + state = load_state() + + if args.force: + # Reset all byte offsets + for sid in state["sessions"]: + state["sessions"][sid]["byte_offset"] = 0 + + # Discover sessions + sessions = discover_sessions( + project_filter=args.project, + session_filter=args.session, + ) + + if not sessions: + print("No sessions found matching filters.") + return + + print(f"Found {len(sessions)} session(s) to check...") + if args.dry_run: + print("DRY RUN — no files will be written\n") + + extracted = 0 + for session_info in sessions: + if extract_session(session_info, state, dry_run=args.dry_run): + extracted += 1 + + if extracted == 0: + print("No new content to extract.") + else: + print(f"\nExtracted {extracted} session(s).") + + if not args.dry_run: + save_state(state) + + +if __name__ == "__main__": + main() diff --git a/scripts/mine-conversations.sh 
#!/usr/bin/env bash
set -euo pipefail

# mine-conversations.sh — Top-level orchestrator for conversation mining pipeline
#
# Chains: Extract (Python) → Summarize (llama.cpp) → Index (Python)
#
# Usage:
#   mine-conversations.sh                    # Full pipeline
#   mine-conversations.sh --extract-only     # Phase A only (no LLM)
#   mine-conversations.sh --summarize-only   # Phase B only (requires llama-server)
#   mine-conversations.sh --index-only       # Phase C only
#   mine-conversations.sh --project mc       # Filter to one project
#   mine-conversations.sh --dry-run          # Show what would be done
#
# Any unrecognized argument is forwarded verbatim to BOTH the extract and the
# summarize scripts.

# Resolve script location first so sibling scripts are found regardless of WIKI_DIR
SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WIKI_DIR="${WIKI_DIR:-$(dirname "${SCRIPTS_DIR}")}"
# The Python phases read WIKI_DIR from the environment and otherwise fall back
# to ~/projects/wiki. Export the value resolved here so every phase works on
# the same wiki root even when the caller did not set WIKI_DIR.
export WIKI_DIR
LOG_FILE="${SCRIPTS_DIR}/.mine.log"

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------

EXTRACT=true
SUMMARIZE=true
INDEX=true
PROJECT=""
DRY_RUN=""
EXTRA_ARGS=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --extract-only)
      SUMMARIZE=false
      INDEX=false
      shift
      ;;
    --summarize-only)
      EXTRACT=false
      INDEX=false
      shift
      ;;
    --index-only)
      EXTRACT=false
      SUMMARIZE=false
      shift
      ;;
    --project)
      # Under `set -u` a missing value would otherwise abort with an opaque
      # "unbound variable" message.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --project requires a project code" >&2
        exit 2
      fi
      PROJECT="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN="--dry-run"
      shift
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print a timestamped message to stdout and append it to the log file.
log() {
  local msg
  msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  echo "${msg}" | tee -a "${LOG_FILE}"
}

# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

# Make sure the directory that actually holds the log file exists.
mkdir -p "$(dirname "${LOG_FILE}")"

log "=== Conversation mining started ==="

# NOTE: arrays are expanded as ${arr[@]+"${arr[@]}"} because bash older than
# 4.4 (e.g. the 3.2 shipped with macOS) treats "${arr[@]}" on an EMPTY array
# as an unbound variable under `set -u` and aborts the script.

# Phase A: Extract
if [[ "${EXTRACT}" == true ]]; then
  log "Phase A: Extracting sessions..."
  local_args=()
  if [[ -n "${PROJECT}" ]]; then
    local_args+=(--project "${PROJECT}")
  fi
  if [[ -n "${DRY_RUN}" ]]; then
    local_args+=(--dry-run)
  fi
  python3 "${SCRIPTS_DIR}/extract-sessions.py" \
    ${local_args[@]+"${local_args[@]}"} \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"} 2>&1 | tee -a "${LOG_FILE}"
fi

# Phase B: Summarize
if [[ "${SUMMARIZE}" == true ]]; then
  log "Phase B: Summarizing conversations..."
  local_args=()
  if [[ -n "${PROJECT}" ]]; then
    local_args+=(--project "${PROJECT}")
  fi
  if [[ -n "${DRY_RUN}" ]]; then
    local_args+=(--dry-run)
  fi
  python3 "${SCRIPTS_DIR}/summarize-conversations.py" \
    ${local_args[@]+"${local_args[@]}"} \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"} 2>&1 | tee -a "${LOG_FILE}"
fi

# Phase C: Index
if [[ "${INDEX}" == true ]]; then
  log "Phase C: Updating index and context..."
  local_args=()
  # In a dry run the index script is still invoked, only without --reindex.
  if [[ -z "${DRY_RUN}" ]]; then
    local_args+=(--reindex)
  fi
  python3 "${SCRIPTS_DIR}/update-conversation-index.py" \
    ${local_args[@]+"${local_args[@]}"} 2>&1 | tee -a "${LOG_FILE}"
fi

log "=== Conversation mining complete ==="
+ +REQUIRED JSON STRUCTURE: + +{"trivial":false,"title":"...","summary":"...","halls":["fact"],"topics":["firebase-emulator","docker-compose"],"decisions":["..."],"discoveries":["..."],"preferences":["..."],"advice":["..."],"events":["..."],"tooling":["..."],"key_exchanges":[{"human":"...","assistant":"..."}],"related_topics":["..."]} + +FIELD RULES: + +title: 3-8 word descriptive title. NOT "Session XYZ". Describe what happened. + +summary: 2-3 sentences. What the human wanted. What the assistant did. What was the outcome. + +topics: REQUIRED. 1-4 kebab-case tags for the main subjects. Examples: firebase-emulator, blue-green-deploy, ci-pipeline, docker-hardening, database-migration, api-key-management, git-commit, test-failures. + +halls: Which knowledge types are present. Pick from: fact, discovery, preference, advice, event, tooling. +- fact = decisions made, config changed, choices locked in +- discovery = root causes, bugs found, breakthroughs +- preference = user working style or preferences +- advice = recommendations, lessons learned +- event = deployments, incidents, milestones +- tooling = scripts used, commands run, failures encountered + +decisions: State each decision as a fact. "Added restart policy to firebase service." +discoveries: State root cause clearly. "npm install failed because working directory was wrong." +preferences: Only if explicitly expressed. Usually empty. +advice: Recommendations made during the session. +events: Notable milestones or incidents. +tooling: Scripts, commands, and tools used. Note failures especially. + +key_exchanges: 1-3 most important moments. Paraphrase to 1 sentence each. + +related_topics: Secondary tags for cross-referencing to other wiki pages. + +trivial: Set true ONLY if < 3 meaningful exchanges and no decisions or discoveries. + +Do NOT omit fields that have no content — use an empty array instead. For example, if no preferences were expressed, use "preferences": []. + +Output ONLY valid JSON. No markdown. No explanation.
diff --git a/scripts/summarize-conversations.py b/scripts/summarize-conversations.py new file mode 100755 index 0000000..59b504d --- /dev/null +++ b/scripts/summarize-conversations.py @@ -0,0 +1,646 @@ +#!/usr/bin/env python3 +"""Summarize extracted conversation transcripts via LLM. + +Phase B of the conversation mining pipeline. Sends transcripts to a local +llama-server or Claude Code CLI for classification, summarization, and +key exchange selection. + +Handles chunking and incremental summarization. + +Usage: + python3 summarize-conversations.py # All unsummarized (local LLM) + python3 summarize-conversations.py --claude # Use claude -p (haiku/sonnet) + python3 summarize-conversations.py --claude --long 300 # Sonnet threshold: 300 msgs + python3 summarize-conversations.py --project mc # One project only + python3 summarize-conversations.py --file path.md # One file + python3 summarize-conversations.py --dry-run # Show what would be done + +Claude mode uses Haiku for short conversations (<= threshold) and Sonnet +for longer ones. Threshold default: 200 messages. 
def llm_call_local(system_prompt: str, user_message: str) -> str | None:
    """Call the local LLM server and return the response content.

    Sends an OpenAI-style chat-completions request to ``AI_BASE_URL`` using
    the module-level ``AI_MODEL``, ``AI_TOKEN``, ``AI_TEMPERATURE`` and
    ``AI_TIMEOUT`` settings.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_message: Text sent as the ``user`` message.

    Returns:
        The ``content`` of the first choice, or ``None`` when the request
        fails or the response does not have the expected shape. Failures are
        reported on stderr and never raised, so the caller can retry.
    """
    import http.client
    import urllib.error
    import urllib.request

    payload = json.dumps({
        "model": AI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": AI_TEMPERATURE,
        "max_tokens": 3000,
    }).encode()

    req = urllib.request.Request(
        f"{AI_BASE_URL}/chat/completions",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {AI_TOKEN}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=AI_TIMEOUT) as resp:
            data = json.loads(resp.read())
        return data["choices"][0]["message"]["content"]
    except (
        # URLError/HTTPError are OSError subclasses; so are the raw socket
        # timeouts and connection resets raised while waiting for or reading
        # the response, which urllib does not wrap in URLError.
        OSError,
        # Truncated or otherwise malformed HTTP responses.
        http.client.HTTPException,
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        ValueError,
        # Well-formed JSON that lacks the expected choices/message structure.
        KeyError,
        IndexError,
        TypeError,
    ) as e:
        print(f" LLM call failed: {e}", file=sys.stderr)
        return None
def extract_json_from_response(text: str) -> dict[str, Any] | None:
    """Extract JSON from LLM response, handling fencing and thinking tags.

    Args:
        text: Raw model output. It may wrap the JSON in ``<think>`` blocks,
            a markdown code fence, or surrounding chatter.

    Returns:
        The decoded JSON object, or ``None`` when no JSON object can be
        decoded from the response.
    """
    # Strip thinking tags. The pattern must include the tag delimiters: a bare
    # ".*?" matches every character and would erase the whole response.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    # Try markdown code block
    match = re.search(r"```(?:json)?\s*\n(.*?)\n```", text, re.DOTALL)
    if match:
        candidate = match.group(1).strip()
    else:
        candidate = text.strip()

    # Find JSON object: outermost "{" ... "}" span
    start = candidate.find("{")
    end = candidate.rfind("}")
    if start >= 0 and end > start:
        candidate = candidate[start : end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None

    # Callers use dict methods on the result; reject arrays, numbers, etc.
    return parsed if isinstance(parsed, dict) else None


# ---------------------------------------------------------------------------
# File parsing
# ---------------------------------------------------------------------------


def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse YAML frontmatter.

    Only flat ``key: value`` lines between the leading ``---`` markers are
    understood; each value is kept as the raw stripped string after the first
    colon.

    Args:
        file_path: Markdown file to read.

    Returns:
        Mapping of frontmatter keys to string values; empty when the file
        does not start with a frontmatter block.
    """
    content = file_path.read_text()
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return {}
    fm: dict[str, str] = {}
    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()
    return fm
def get_existing_summary(file_path: Path) -> str:
    """Get existing summary sections (between frontmatter end and transcript).

    Returns:
        The stripped text between the closing frontmatter marker and the
        ``## Transcript`` heading, or ``""`` when either is missing.
    """
    content = file_path.read_text()
    parts = content.split("---", 2)
    if len(parts) < 3:
        return ""
    after_fm = parts[2]
    idx = after_fm.find("## Transcript")
    if idx < 0:
        return ""
    return after_fm[:idx].strip()


# ---------------------------------------------------------------------------
# Chunking
# ---------------------------------------------------------------------------


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split text into chunks of at most ``max_chars``, breaking at line boundaries.

    Lines are kept whole whenever they fit. A single line longer than
    ``max_chars`` is hard-split so that no chunk exceeds the limit.
    Concatenating the returned chunks always reproduces ``text``.

    Args:
        text: Text to split.
        max_chars: Maximum size of each chunk in characters.

    Returns:
        The chunks in order; ``[text]`` when the text already fits.
    """
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    current = ""

    for line in text.splitlines(keepends=True):
        # Hard-split a line that cannot fit in any chunk on its own.
        # (Guarded so a non-positive limit cannot loop forever.)
        while max_chars > 0 and len(line) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:max_chars])
            line = line[max_chars:]

        if current and len(current) + len(line) > max_chars:
            chunks.append(current)
            current = line
        else:
            current += line

    if current:
        chunks.append(current)

    return chunks


# ---------------------------------------------------------------------------
# Summarization
# ---------------------------------------------------------------------------


def select_claude_model(file_path: Path, long_threshold: int) -> str:
    """Pick haiku or sonnet based on message count.

    The count is read from the ``messages`` frontmatter key; a missing or
    non-numeric value counts as 0. Sonnet is chosen only when the count is
    strictly greater than ``long_threshold``.
    """
    fm = parse_frontmatter(file_path)
    try:
        msg_count = int(fm.get("messages", "0"))
    except ValueError:
        msg_count = 0
    if msg_count > long_threshold:
        return CLAUDE_SONNET_MODEL
    return CLAUDE_HAIKU_MODEL
Returns True on success.""" + transcript = get_transcript(file_path) + if not transcript.strip(): + print(f" [skip] {file_path.name} — no transcript") + return False + + existing_summary = get_existing_summary(file_path) + is_incremental = "## Summary" in existing_summary + + # Pick chunk sizes based on provider + if use_claude: + max_chunk = MAX_CHUNK_CHARS_CLAUDE + max_rolling = MAX_ROLLING_CONTEXT_CHARS_CLAUDE + else: + max_chunk = MAX_CHUNK_CHARS_LOCAL + max_rolling = MAX_ROLLING_CONTEXT_CHARS_LOCAL + + chunks = chunk_text(transcript, max_chunk) + num_chunks = len(chunks) + + # Pick model for claude mode + claude_model = "" + if use_claude: + claude_model = select_claude_model(file_path, long_threshold) + + if dry_run: + mode = "incremental" if is_incremental else "new" + model_info = f", model={claude_model}" if use_claude else "" + print(f" [dry-run] {file_path.name} — {num_chunks} chunk(s) ({mode}{model_info})") + return True + + model_label = f" [{claude_model}]" if use_claude else "" + print(f" [summarize] {file_path.name} — {num_chunks} chunk(s)" + f"{' (incremental)' if is_incremental else ''}{model_label}") + + rolling_context = "" + if is_incremental: + rolling_context = f"EXISTING SUMMARY (extend, do not repeat):\n{existing_summary}\n\n" + + final_json: dict[str, Any] | None = None + start_time = time.time() + + for i, chunk in enumerate(chunks, 1): + if rolling_context: + user_msg = ( + f"{rolling_context}\n\n" + f"NEW CONVERSATION CONTENT (chunk {i}/{num_chunks}):\n{chunk}" + ) + else: + user_msg = f"CONVERSATION TRANSCRIPT (chunk {i}/{num_chunks}):\n{chunk}" + + if i == num_chunks: + user_msg += "\n\nThis is the FINAL chunk. Produce the complete JSON summary now." + else: + user_msg += "\n\nMore chunks follow. Produce a PARTIAL summary JSON for what you've seen so far." 
+ + # Call the appropriate LLM (with retry on parse failure) + max_attempts = 2 + parsed = None + for attempt in range(1, max_attempts + 1): + if use_claude: + # Longer timeout for sonnet / multi-chunk conversations + call_timeout = 600 if claude_model == CLAUDE_SONNET_MODEL else 300 + response = llm_call_claude(system_prompt, user_msg, + model=claude_model, timeout=call_timeout) + else: + response = llm_call_local(system_prompt, user_msg) + + if not response: + print(f" [error] LLM call failed on chunk {i}/{num_chunks} (attempt {attempt})") + if attempt < max_attempts: + continue + return False + + parsed = extract_json_from_response(response) + if parsed: + break + + print(f" [warn] JSON parse failed on chunk {i}/{num_chunks} (attempt {attempt})") + if attempt < max_attempts: + print(f" Retrying...") + else: + # Log first 200 chars for debugging + print(f" Response preview: {response[:200]}", file=sys.stderr) + + if not parsed: + print(f" [error] JSON parse failed on chunk {i}/{num_chunks} after {max_attempts} attempts") + return False + + final_json = parsed + + # Build rolling context for next chunk + partial_summary = parsed.get("summary", "") + if partial_summary: + rolling_context = f"PARTIAL SUMMARY SO FAR:\n{partial_summary}" + decisions = parsed.get("decisions", []) + if decisions: + rolling_context += "\n\nKEY DECISIONS:\n" + "\n".join( + f"- {d}" for d in decisions[:5] + ) + if len(rolling_context) > max_rolling: + rolling_context = rolling_context[:max_rolling] + "..." 
def _as_str_list(value: Any) -> list[str]:
    """Coerce an LLM-supplied field to a list of non-empty strings."""
    if value is None:
        return []
    if isinstance(value, str):
        # A bare string must not be iterated character by character.
        return [value] if value.strip() else []
    if isinstance(value, (list, tuple)):
        return [str(v) for v in value if v is not None and str(v).strip()]
    return []


def apply_summary(file_path: Path, summary_json: dict[str, Any]) -> None:
    """Apply LLM summary to the conversation markdown file.

    Rewrites the file as: updated frontmatter, the generated summary
    sections, then the original ``## Transcript`` section. Files without a
    frontmatter block are left untouched.

    Args:
        file_path: Conversation markdown file to rewrite in place.
        summary_json: Parsed summary object produced by the LLM. List fields
            are tolerated as a single string or as missing.
    """
    content = file_path.read_text()

    # Parse existing frontmatter
    fm_match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not fm_match:
        return

    # Find transcript (kept verbatim, including its leading newline)
    transcript_idx = content.find("\n## Transcript\n")
    transcript_section = content[transcript_idx:] if transcript_idx >= 0 else ""

    # Values going into the frontmatter
    new_status = "trivial" if summary_json.get("trivial", False) else "summarized"
    # Frontmatter values are single-line; collapse any whitespace/newlines.
    title = " ".join(str(summary_json.get("title") or "").split()) or "Untitled Session"
    halls = _as_str_list(summary_json.get("halls"))
    topics = _as_str_list(summary_json.get("topics"))
    related = _as_str_list(summary_json.get("related_topics"))

    # Existing keys keep their original order
    fm_dict: dict[str, str] = {}
    fm_key_order: list[str] = []
    for line in fm_match.group(1).splitlines():
        if ":" in line:
            key, _, val = line.partition(":")
            key = key.strip()
            if key not in fm_dict:
                fm_key_order.append(key)
            fm_dict[key] = val.strip()

    fm_dict["title"] = title
    fm_dict["status"] = new_status
    if halls:
        fm_dict["halls"] = "[" + ", ".join(halls) + "]"
    if topics:
        fm_dict["topics"] = "[" + ", ".join(topics) + "]"
    if related:
        fm_dict["related"] = "[" + ", ".join(related) + "]"

    # Append keys that were not already present, so none is silently dropped
    for key in ["title", "status", "halls", "topics", "related"]:
        if key in fm_dict and key not in fm_key_order:
            fm_key_order.append(key)

    new_fm = "\n".join(f"{k}: {fm_dict[k]}" for k in fm_key_order)

    # Build summary sections
    sections: list[str] = []

    summary_text = summary_json.get("summary", "")
    if summary_text:
        sections.append(f"## Summary\n\n{summary_text}")

    for hall_name, hall_label in [
        ("decisions", "Decisions (hall: fact)"),
        ("discoveries", "Discoveries (hall: discovery)"),
        ("preferences", "Preferences (hall: preference)"),
        ("advice", "Advice (hall: advice)"),
        ("events", "Events (hall: event)"),
        ("tooling", "Tooling (hall: tooling)"),
    ]:
        items = _as_str_list(summary_json.get(hall_name))
        if items:
            lines = [f"## {hall_label}\n"]
            lines.extend(f"- {item}" for item in items)
            sections.append("\n".join(lines))

    exchanges = summary_json.get("key_exchanges") or []
    if isinstance(exchanges, (dict, str)):
        # A single exchange given without the enclosing list
        exchanges = [exchanges]
    if exchanges:
        lines = ["## Key Exchanges\n"]
        for ex in exchanges:
            if isinstance(ex, dict):
                human = ex.get("human", "")
                assistant = ex.get("assistant", "")
                lines.append(f"> **Human**: {human}")
                lines.append(">")
                lines.append(f"> **Assistant**: {assistant}")
                lines.append("")
            elif isinstance(ex, str):
                lines.append(f"- {ex}")
        sections.append("\n".join(lines))

    # Assemble
    output = f"---\n{new_fm}\n---\n\n"
    if sections:
        output += "\n\n".join(sections) + "\n\n---\n"
    output += transcript_section
    if not output.endswith("\n"):
        output += "\n"

    file_path.write_text(output)
in sorted(search_dir.rglob("*.md")): + if md_file.name in ("index.md", ".gitkeep"): + continue + fm = parse_frontmatter(md_file) + if fm.get("status") == "extracted": + files.append(md_file) + + return files + + +def update_mine_state(session_id: str, msg_count: int) -> None: + """Update summarized_through_msg in mine state.""" + if not MINE_STATE_FILE.exists(): + return + try: + with open(MINE_STATE_FILE) as f: + state = json.load(f) + if session_id in state.get("sessions", {}): + state["sessions"][session_id]["summarized_through_msg"] = msg_count + with open(MINE_STATE_FILE, "w") as f: + json.dump(state, f, indent=2) + except (json.JSONDecodeError, KeyError): + pass + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser(description="Summarize conversation transcripts") + parser.add_argument("--project", help="Only summarize this project code") + parser.add_argument("--file", help="Summarize a specific file") + parser.add_argument("--dry-run", action="store_true", help="Show what would be done") + parser.add_argument( + "--claude", action="store_true", + help="Use claude -p instead of local LLM (haiku for short, sonnet for long)", + ) + parser.add_argument( + "--long", type=int, default=CLAUDE_LONG_THRESHOLD, metavar="N", + help=f"Message count threshold for sonnet (default: {CLAUDE_LONG_THRESHOLD})", + ) + parser.add_argument("--ai-url", default=AI_BASE_URL) + parser.add_argument("--ai-model", default=AI_MODEL) + parser.add_argument("--ai-timeout", type=int, default=AI_TIMEOUT) + args = parser.parse_args() + + # Update module-level config from args (local LLM only) + _update_config(args.ai_url, args.ai_model, args.ai_timeout) + + # Load system prompt + if not MINE_PROMPT_FILE.exists(): + print(f"ERROR: Prompt not found: {MINE_PROMPT_FILE}", file=sys.stderr) + sys.exit(1) + system_prompt = 
MINE_PROMPT_FILE.read_text() + + # Find files + files = find_files_to_summarize(args.project, args.file) + if not files: + print("No conversations need summarization.") + return + + provider = "claude -p" if args.claude else f"local ({AI_MODEL})" + print(f"Found {len(files)} conversation(s) to summarize. Provider: {provider}") + + if args.dry_run: + for f in files: + summarize_file(f, system_prompt, dry_run=True, + use_claude=args.claude, long_threshold=args.long) + return + + # Check provider availability + if args.claude: + try: + result = subprocess.run( + ["claude", "--version"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode != 0: + print("ERROR: 'claude' CLI not working", file=sys.stderr) + sys.exit(1) + print(f"Claude CLI: {result.stdout.strip()}") + except (FileNotFoundError, subprocess.TimeoutExpired): + print("ERROR: 'claude' CLI not found in PATH", file=sys.stderr) + sys.exit(1) + else: + import urllib.request + import urllib.error + health_url = AI_BASE_URL.replace("/v1", "/health") + try: + urllib.request.urlopen(health_url, timeout=5) + except urllib.error.URLError: + print(f"ERROR: LLM server not responding at {health_url}", file=sys.stderr) + sys.exit(1) + + processed = 0 + errors = 0 + total_start = time.time() + + for i, f in enumerate(files, 1): + print(f"\n[{i}/{len(files)}]", end=" ") + try: + if summarize_file(f, system_prompt, use_claude=args.claude, + long_threshold=args.long): + processed += 1 + + # Update mine state + fm = parse_frontmatter(f) + sid = fm.get("session_id", "") + msgs = fm.get("messages", "0") + if sid: + try: + update_mine_state(sid, int(msgs)) + except ValueError: + pass + else: + errors += 1 + except Exception as e: + print(f" [crash] {f.name} — {e}", file=sys.stderr) + errors += 1 + + elapsed = time.time() - total_start + print(f"\nDone. 
Summarized: {processed}, Errors: {errors}, Time: {elapsed:.0f}s") + + +if __name__ == "__main__": + main() diff --git a/scripts/update-conversation-index.py b/scripts/update-conversation-index.py new file mode 100755 index 0000000..663d5a0 --- /dev/null +++ b/scripts/update-conversation-index.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python3 +"""Update conversation index and context files from summarized conversations. + +Phase C of the conversation mining pipeline. Reads all conversation markdown +files and regenerates: + - conversations/index.md — catalog organized by project + - context/wake-up.md — world briefing from recent conversations + - context/active-concerns.md — current blockers and open threads + +Usage: + python3 update-conversation-index.py + python3 update-conversation-index.py --reindex # Also triggers qmd update +""" + +from __future__ import annotations + +import argparse +import os +import re +import subprocess +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki"))) +CONVERSATIONS_DIR = WIKI_DIR / "conversations" +CONTEXT_DIR = WIKI_DIR / "context" +INDEX_FILE = CONVERSATIONS_DIR / "index.md" +WAKEUP_FILE = CONTEXT_DIR / "wake-up.md" +CONCERNS_FILE = CONTEXT_DIR / "active-concerns.md" + +# ════════════════════════════════════════════════════════════════════════════ +# CONFIGURE ME — Project code to display name mapping +# ════════════════════════════════════════════════════════════════════════════ +# +# Every project code you use in `extract-sessions.py`'s PROJECT_MAP should +# have a display name here. 
# The conversation index groups conversations by
# these codes and renders them under sections named by the display name.
#
# Examples — replace with your own:
PROJECT_NAMES: dict[str, str] = {
    "wiki": "WIKI — This Wiki",
    "cl": "CL — Claude Config",
    # "web": "WEB — My Webapp",
    # "mob": "MOB — My Mobile App",
    # "work": "WORK — Day Job",
    "general": "General — Cross-Project",
}

# Order for display — put your most-active projects first
PROJECT_ORDER = [
    # "work", "web", "mob",
    "wiki", "cl", "general",
]


# ---------------------------------------------------------------------------
# Frontmatter parsing
# ---------------------------------------------------------------------------


def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Parse YAML frontmatter from a markdown file.

    Only flat ``key: value`` lines are understood; values are returned as the
    raw stripped text after the first colon.

    Returns:
        Mapping of keys to values; empty when the file has no frontmatter.
    """
    fm: dict[str, str] = {}
    content = file_path.read_text()

    # Find frontmatter between --- markers
    match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return fm

    for line in match.group(1).splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            fm[key.strip()] = value.strip()

    return fm


def get_summary_line(file_path: Path) -> str:
    """Extract the first sentence of the Summary section.

    Returns:
        The first sentence on a single line, terminated with "." and
        truncated to 120 characters, or "No summary available." when the file
        has no non-empty Summary section.
    """
    content = file_path.read_text()
    # \Z lets a Summary section that ends the file match as well.
    match = re.search(r"## Summary\n\n(.+?)(?:\n\n|\n##|\Z)", content, re.DOTALL)
    if not match:
        return "No summary available."

    # Collapse line breaks so the result fits in one list item.
    summary = " ".join(match.group(1).split())
    if not summary:
        return "No summary available."

    # First sentence
    first_sentence = summary.split(". ")[0]
    if not first_sentence.endswith("."):
        first_sentence += "."
    # Truncate if too long
    if len(first_sentence) > 120:
        first_sentence = first_sentence[:117] + "..."
    return first_sentence
def _get_section_bullets(file_path: Path, heading: str) -> list[str]:
    """Return the "- " bullet items of the first section whose title starts with *heading*."""
    content = file_path.read_text()
    match = re.search(
        rf"## {re.escape(heading)}.*?\n(.*?)(?:\n##|\n---|\Z)", content, re.DOTALL
    )
    if not match:
        return []
    stripped = (line.strip() for line in match.group(1).strip().splitlines())
    return [line[2:] for line in stripped if line.startswith("- ")]


def get_decisions(file_path: Path) -> list[str]:
    """Extract decisions from a conversation file.

    Returns:
        The bullet items under the "## Decisions" heading, without the
        leading "- "; empty when the section is absent.
    """
    return _get_section_bullets(file_path, "Decisions")


def get_discoveries(file_path: Path) -> list[str]:
    """Extract discoveries from a conversation file.

    Returns:
        The bullet items under the "## Discoveries" heading, without the
        leading "- "; empty when the section is absent.
    """
    return _get_section_bullets(file_path, "Discoveries")


# ---------------------------------------------------------------------------
# Conversation discovery
# ---------------------------------------------------------------------------


def discover_conversations() -> dict[str, list[dict[str, Any]]]:
    """Discover all conversation files organized by project.

    Scans each sub-directory of ``CONVERSATIONS_DIR`` whose name is a key of
    ``PROJECT_NAMES`` and builds one entry per ``*.md`` file from its
    frontmatter. Files are visited in reverse filename order.

    Returns:
        Mapping of project code to its conversation entries; empty when the
        conversations directory does not exist yet.
    """
    by_project: dict[str, list[dict[str, Any]]] = defaultdict(list)

    # Nothing has been mined yet: report "no conversations" instead of crashing.
    if not CONVERSATIONS_DIR.is_dir():
        return by_project

    for project_dir in sorted(CONVERSATIONS_DIR.iterdir()):
        if not project_dir.is_dir():
            continue

        project_code = project_dir.name
        if project_code not in PROJECT_NAMES:
            continue

        for md_file in sorted(project_dir.glob("*.md"), reverse=True):
            fm = parse_frontmatter(md_file)

            by_project[project_code].append({
                "file": md_file,
                "relative": md_file.relative_to(CONVERSATIONS_DIR),
                "title": fm.get("title", md_file.stem),
                "date": fm.get("date", "unknown"),
                # Files without a status have not been summarized yet.
                "status": fm.get("status", "extracted"),
                "messages": fm.get("messages", "0"),
                "halls": fm.get("halls", ""),
                "topics": fm.get("topics", ""),
                "project": project_code,
            })

    return by_project
def generate_index(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate the conversations/index.md content.

    Args:
        by_project: Mapping of project code to conversation entries as built
            by ``discover_conversations``.

    Returns:
        The full markdown document: frontmatter, overall counts, then one
        section per project in ``PROJECT_ORDER``. Trivial sessions are
        counted but not listed.
    """
    all_convos = [c for convos in by_project.values() for c in convos]
    total = len(all_convos)
    summarized = sum(1 for c in all_convos if c["status"] == "summarized")
    trivial = sum(1 for c in all_convos if c["status"] == "trivial")
    # Anything that is neither summarized nor trivial is reported as pending.
    extracted = total - summarized - trivial

    lines = [
        "---",
        "title: Conversation Index",
        "type: index",
        f"last_updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}",
        "---",
        "",
        "# Conversation Index",
        "",
        "Mined conversations from Claude Code sessions, organized by project (wing).",
        "",
        f"**{total} conversations** — {summarized} summarized, {extracted} pending, {trivial} trivial.",
        "",
        "---",
        "",
    ]

    for project_code in PROJECT_ORDER:
        convos = by_project.get(project_code, [])
        display_name = PROJECT_NAMES.get(project_code, project_code.upper())

        lines.append(f"## {display_name}")
        lines.append("")

        if not convos:
            lines.append("_No conversations mined yet._")
            lines.append("")
            continue

        # Entries keep the order they were discovered in; trivial sessions
        # are left out of the listing and only counted below.
        shown = 0
        for c in convos:
            if c["status"] == "trivial":
                continue

            status_tag = " _(pending summary)_" if c["status"] == "extracted" else ""
            summary_text = ""
            if c["status"] == "summarized":
                summary_text = f" — {get_summary_line(c['file'])}"

            # Markdown link targets must use forward slashes on every
            # platform; str(Path) would emit backslashes on Windows.
            relative = c["relative"]
            link = relative.as_posix() if hasattr(relative, "as_posix") else str(relative)

            lines.append(
                f"- [{c['title']}]({link})"
                f" ({c['date']}, {c['messages']} msgs)"
                f"{summary_text}{status_tag}"
            )
            shown += 1

        trivial_count = len(convos) - shown
        if trivial_count > 0:
            lines.append(f"\n_{trivial_count} trivial session(s) not listed._")

        lines.append("")

    return "\n".join(lines)
def _parse_session_date(value: Any) -> Any:
    """Return *value* parsed as a naive ``YYYY-MM-DD`` datetime, or ``None`` if it is not a date."""
    try:
        return datetime.strptime(str(value), "%Y-%m-%d")
    except ValueError:
        return None


def _latest_dated(convos: list[dict[str, Any]]) -> Any:
    """Return ``(parsed_date, raw_date)`` of the newest validly dated conversation, or ``None``."""
    dated = []
    for convo in convos:
        parsed = _parse_session_date(convo["date"])
        if parsed is not None:
            dated.append((parsed, convo["date"]))
    return max(dated, key=lambda pair: pair[0]) if dated else None


def _newest_first(items: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
    """Sort ``(date, project, text)`` tuples newest first; undated entries go last."""
    return sorted(
        items,
        key=lambda item: _parse_session_date(item[0]) or datetime.min,
        reverse=True,
    )


def generate_wakeup(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/wake-up.md from recent conversations.

    Produces a project roster (status, last activity, session count) followed
    by the 7 most recent decisions and 5 most recent discoveries taken from
    summarized conversations.

    Conversations whose ``date`` is not ``YYYY-MM-DD`` (for example the
    ``"unknown"`` placeholder) are ignored when picking a project's latest
    activity, so one undated file can no longer mask real dates. All date
    arithmetic is done against the current UTC day, the same day that is
    written to ``last_updated``.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    # Naive midnight of the current UTC day; parsed session dates are naive
    # too, so the subtraction below never mixes local time with UTC.
    today_start = datetime.strptime(today, "%Y-%m-%d")

    # Determine activity level per project
    project_activity: dict[str, dict[str, Any]] = {}
    for code in PROJECT_ORDER:
        convos = by_project.get(code, [])
        summarized = [c for c in convos if c["status"] == "summarized"]
        latest = _latest_dated(summarized or convos)

        if not convos:
            status, last_date = "—", "—"
        elif latest is None:
            status, last_date = "Unknown", "—"
        elif summarized:
            parsed, last_date = latest
            # Simple activity heuristic: sessions in last 7 days = active
            days_ago = (today_start - parsed).days
            if days_ago <= 7:
                status = "Active"
            elif days_ago <= 30:
                status = "Quiet"
            else:
                status = "Inactive"
        else:
            # Extracted-only project: active when the newest session falls
            # in the current month (or later).
            parsed, last_date = latest
            this_month = (today_start.year, today_start.month)
            status = "Active" if (parsed.year, parsed.month) >= this_month else "Quiet"

        project_activity[code] = {
            "status": status,
            "last_date": last_date,
            "count": len(convos),
        }

    # Gather decisions and discoveries from summarized conversations only
    decisions: list[tuple[str, str, str]] = []  # (date, project, decision)
    discoveries: list[tuple[str, str, str]] = []  # (date, project, discovery)
    for code, convos in by_project.items():
        for c in convos:
            if c["status"] != "summarized":
                continue
            for decision in get_decisions(c["file"]):
                decisions.append((c["date"], code, decision))
            for disc in get_discoveries(c["file"]):
                discoveries.append((c["date"], code, disc))

    recent_decisions = _newest_first(decisions)[:7]
    recent_discoveries = _newest_first(discoveries)[:5]

    lines = [
        "---",
        "title: Wake-Up Briefing",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Wake-Up Briefing",
        "",
        "Auto-generated world state for AI session context.",
        "",
        "## Active Projects",
        "",
        "| Code | Project | Status | Last Activity | Sessions |",
        "|------|---------|--------|---------------|----------|",
    ]

    for code in PROJECT_ORDER:
        if code == "general":
            continue  # Skip general from roster
        info = project_activity.get(code, {"status": "—", "last_date": "—", "count": 0})
        # Display names look like "CODE — Project"; show only the project part.
        full_name = PROJECT_NAMES.get(code, "")
        display = full_name.split(" — ")[1] if " — " in full_name else code
        lines.append(
            f"| {code.upper()} | {display} | {info['status']} | {info['last_date']} | {info['count']} |"
        )

    lines.append("")

    if recent_decisions:
        lines.append("## Recent Decisions")
        lines.append("")
        for date, proj, decision in recent_decisions:
            lines.append(f"- **[{proj.upper()}]** {decision} ({date})")
        lines.append("")

    if recent_discoveries:
        lines.append("## Recent Discoveries")
        lines.append("")
        for date, proj, disc in recent_discoveries:
            lines.append(f"- **[{proj.upper()}]** {disc} ({date})")
        lines.append("")

    if not recent_decisions and not recent_discoveries:
        lines.append("## Recent Decisions")
        lines.append("")
        lines.append("_Populated after summarization runs._")
        lines.append("")

    return "\n".join(lines)
def generate_concerns(by_project: dict[str, list[dict[str, Any]]]) -> str:
    """Generate context/active-concerns.md from recent conversations.

    Lists up to five projects with sessions dated in the current UTC month
    ("Current Focus Areas"), followed by placeholder "Blockers" and
    "Open Questions" sections.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    month_prefix = today[:7]  # "YYYY-MM"

    # For now, this is a template that gets populated as summaries accumulate.
    # Future enhancement: parse "blockers", "open questions" from summaries.
    lines = [
        "---",
        "title: Active Concerns",
        "type: context",
        f"last_updated: {today}",
        "---",
        "",
        "# Active Concerns",
        "",
        "Auto-generated from recent conversations. Current blockers, deadlines, and open questions.",
        "",
    ]

    # Count this month's sessions per project to give a sense of what's hot.
    # A prefix match (rather than ">=") keeps undated entries such as
    # "unknown" and future-dated files out of the "this month" count.
    active_projects: list[tuple[str, int]] = []
    for code in PROJECT_ORDER:
        convos = by_project.get(code, [])
        recent = [c for c in convos if str(c["date"]).startswith(month_prefix)]
        if recent:
            active_projects.append((code, len(recent)))

    if active_projects:
        active_projects.sort(key=lambda x: x[1], reverse=True)
        lines.append("## Current Focus Areas")
        lines.append("")
        for code, count in active_projects[:5]:
            display = PROJECT_NAMES.get(code, code)
            lines.append(f"- **{display}** — {count} session(s) this month")
        lines.append("")

    lines.extend([
        "## Blockers",
        "",
        "_Populated from conversation analysis._",
        "",
        "## Open Questions",
        "",
        "_Populated from conversation analysis._",
        "",
    ])

    return "\n".join(lines)
def main() -> None:
    """Regenerate the conversation index and context files, optionally reindexing qmd.

    Writes ``INDEX_FILE``, ``WAKEUP_FILE`` and ``CONCERNS_FILE`` (creating
    each parent directory if needed). With ``--reindex`` it then runs
    ``qmd update`` and ``qmd embed``; a missing or failing ``qmd`` is reported
    on stderr but does not abort the script.
    """
    parser = argparse.ArgumentParser(
        description="Update conversation index and context files",
    )
    parser.add_argument(
        "--reindex",
        action="store_true",
        help="Also trigger qmd update and embed after updating files",
    )
    args = parser.parse_args()

    # Discover all conversations
    by_project = discover_conversations()

    total = sum(len(v) for v in by_project.values())
    print(f"Found {total} conversation(s) across {len(by_project)} projects.")

    # Generate and write each output; every target gets its own mkdir so the
    # files do not have to share a parent directory.
    outputs = (
        (INDEX_FILE, generate_index),
        (WAKEUP_FILE, generate_wakeup),
        (CONCERNS_FILE, generate_concerns),
    )
    for target, build in outputs:
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(build(by_project))
        print(f"Updated {target.relative_to(WIKI_DIR)}")

    # Optionally trigger qmd reindex
    if args.reindex:
        print("Triggering qmd reindex...")
        try:
            for step in ("update", "embed"):
                subprocess.run(["qmd", step], check=True, capture_output=True, text=True)
            print("qmd index updated.")
        except FileNotFoundError:
            print("qmd not found — skipping reindex.", file=sys.stderr)
        except subprocess.CalledProcessError as e:
            print(f"qmd reindex failed: {e}", file=sys.stderr)
            # The output was captured, so surface it; otherwise the cause is lost.
            detail = (e.stderr or "").strip()
            if detail:
                print(detail, file=sys.stderr)
+""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import subprocess +import sys +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +# Force unbuffered output for pipe usage +sys.stdout.reconfigure(line_buffering=True) +sys.stderr.reconfigure(line_buffering=True) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +WIKI_DIR = Path(os.environ.get("WIKI_DIR", str(Path.home() / "projects" / "wiki"))) +CONVERSATIONS_DIR = WIKI_DIR / "conversations" +RAW_HARVESTED_DIR = WIKI_DIR / "raw" / "harvested" +STAGING_DIR = WIKI_DIR / "staging" +INDEX_FILE = WIKI_DIR / "index.md" +CLAUDE_MD = WIKI_DIR / "CLAUDE.md" +HARVEST_STATE_FILE = WIKI_DIR / ".harvest-state.json" + +# ════════════════════════════════════════════════════════════════════════════ +# CONFIGURE ME — URL classification rules +# ════════════════════════════════════════════════════════════════════════════ +# +# Type D: always skip. Add your own internal/ephemeral/personal domains here. +# Patterns use `re.search` so unanchored suffixes like `\.example\.com$` work. +# Private IPs (10.x, 172.16-31.x, 192.168.x, 127.x) are detected separately. 
+SKIP_DOMAIN_PATTERNS = [ + # Generic: ephemeral / personal / chat / internal + r"\.atlassian\.net$", + r"^app\.asana\.com$", + r"^(www\.)?slack\.com$", + r"\.slack\.com$", + r"^(www\.)?discord\.com$", + r"^localhost$", + r"^0\.0\.0\.0$", + r"^mail\.google\.com$", + r"^calendar\.google\.com$", + r"^docs\.google\.com$", + r"^drive\.google\.com$", + r"^.+\.local$", + r"^.+\.internal$", + # Add your own internal domains below, for example: + # r"\.mycompany\.com$", + # r"^git\.mydomain\.com$", +] + +# Type C — issue trackers / Q&A; only harvest if topic touches existing wiki +C_TYPE_URL_PATTERNS = [ + r"^https?://github\.com/[^/]+/[^/]+/issues/\d+", + r"^https?://github\.com/[^/]+/[^/]+/pull/\d+", + r"^https?://github\.com/[^/]+/[^/]+/discussions/\d+", + r"^https?://(www\.)?stackoverflow\.com/questions/\d+", + r"^https?://(www\.)?serverfault\.com/questions/\d+", + r"^https?://(www\.)?superuser\.com/questions/\d+", + r"^https?://.+\.stackexchange\.com/questions/\d+", +] + +# Asset/image extensions to filter out +ASSET_EXTENSIONS = { + ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico", ".bmp", + ".css", ".js", ".mjs", ".woff", ".woff2", ".ttf", ".eot", + ".mp4", ".webm", ".mov", ".mp3", ".wav", + ".zip", ".tar", ".gz", ".bz2", +} + +# URL regex — HTTP(S), stops at whitespace, brackets, and common markdown delimiters +URL_REGEX = re.compile( + r"https?://[^\s<>\"')\]}\\|`]+", + re.IGNORECASE, +) + +# Claude CLI models +CLAUDE_HAIKU_MODEL = "haiku" +CLAUDE_SONNET_MODEL = "sonnet" +SONNET_CONTENT_THRESHOLD = 20_000 # chars — larger than this → sonnet + +# Fetch behavior +FETCH_DELAY_SECONDS = 2 +MAX_FAILED_ATTEMPTS = 3 +MIN_CONTENT_LENGTH = 100 +FETCH_TIMEOUT = 45 + +# HTML-leak detection — content containing any of these is treated as a failed extraction +HTML_LEAK_MARKERS = [" dict[str, Any]: + defaults: dict[str, Any] = { + "harvested_urls": {}, + "skipped_urls": {}, + "failed_urls": {}, + "rejected_urls": {}, + "last_run": None, + } + if 
def save_state(state: dict[str, Any]) -> None:
    """Stamp ``last_run`` on *state* and persist it to ``HARVEST_STATE_FILE``.

    The JSON is written to a sibling ``.json.tmp`` file first and then moved
    over the real state file.
    """
    state["last_run"] = datetime.now(timezone.utc).isoformat()
    scratch = HARVEST_STATE_FILE.with_suffix(".json.tmp")
    scratch.write_text(json.dumps(state, indent=2, sort_keys=True))
    scratch.replace(HARVEST_STATE_FILE)


# ---------------------------------------------------------------------------
# URL extraction
# ---------------------------------------------------------------------------


def extract_urls_from_file(file_path: Path) -> list[str]:
    """Return the distinct HTTP(S) URLs of a conversation file, in first-seen order.

    Dropped along the way:
    - asset URLs (path ends with one of ``ASSET_EXTENSIONS``)
    - URLs shorter than 20 characters
    - anything without both a scheme and a network location
    - repeats within the same file

    An unreadable file yields an empty list.
    """
    try:
        text = file_path.read_text(errors="replace")
    except OSError:
        return []

    # A dict doubles as an insertion-ordered set.
    ordered: dict[str, None] = {}
    asset_suffixes = tuple(ASSET_EXTENSIONS)

    for hit in URL_REGEX.finditer(text):
        # Trailing sentence punctuation first, then markdown/code artifacts.
        candidate = hit.group(0).rstrip(".,;:!?").rstrip("()[]{}\"'")
        if len(candidate) < 20:
            continue
        try:
            parts = urlparse(candidate)
        except ValueError:
            continue
        if not (parts.scheme and parts.netloc):
            continue
        if parts.path.lower().endswith(asset_suffixes):
            continue
        ordered.setdefault(candidate, None)

    return list(ordered)
def _is_private_ip(host: str) -> bool:
    """Return True if host is an RFC1918 or loopback IPv4 literal."""
    if not re.match(r"^\d+\.\d+\.\d+\.\d+$", host):
        return False
    first, second = (int(p) for p in host.split(".")[:2])
    if first in (10, 127):
        return True
    if first == 172 and 16 <= second <= 31:
        return True
    return first == 192 and second == 168


def classify_url(url: str) -> str:
    """Classify a URL as 'harvest' (A/B), 'check' (C), or 'skip' (D).

    A URL is skipped when it has no hostname, points at a private/loopback
    IPv4 address, or its host matches one of ``SKIP_DOMAIN_PATTERNS``. It is
    marked 'check' when the full URL matches one of ``C_TYPE_URL_PATTERNS``.
    Everything else is 'harvest'.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return "skip"

    host = (parsed.hostname or "").lower()
    if not host or _is_private_ip(host):
        return "skip"

    if any(re.search(pattern, host) for pattern in SKIP_DOMAIN_PATTERNS):
        return "skip"

    if any(re.match(pattern, url) for pattern in C_TYPE_URL_PATTERNS):
        return "check"

    return "harvest"


# ---------------------------------------------------------------------------
# Filename derivation
# ---------------------------------------------------------------------------


def slugify(text: str) -> str:
    """Lower-case *text* and collapse every non ``[a-z0-9]`` run into a single hyphen."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9]+", "-", text)
    return text.strip("-")


def raw_filename_for_url(url: str) -> str:
    """Derive the ``<host>-<path>.md`` raw-file name for *url*.

    Only a leading ``www.`` is dropped from the host; the same characters
    elsewhere in a hostname are part of its identity and are kept. A path
    that is empty, or that contains nothing slug-worthy, becomes ``index``.
    The path part is capped at 80 characters. Query string and fragment are
    ignored.
    """
    parsed = urlparse(url)
    host = parsed.netloc.lower()
    if host.startswith("www."):
        host = host[len("www."):]
    host_slug = slugify(host)
    # slugify() already trims separators, so "/docs/" and "/docs" agree, and
    # an all-punctuation path falls back to "index" instead of "".
    path_slug = slugify(parsed.path) or "index"
    # Truncate overly long names
    if len(path_slug) > 80:
        path_slug = path_slug[:80].rstrip("-")
    return f"{host_slug}-{path_slug}.md"
def run_fetch_command(cmd: list[str], timeout: int = FETCH_TIMEOUT) -> tuple[bool, str]:
    """Run a fetch command and return (success, output).

    On success the output is the command's stdout. On failure it is a short
    reason: the stripped stderr (or "non-zero exit"), "timeout",
    "command not found: ...", or the text of the ``OSError``.
    """
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, "timeout"
    except FileNotFoundError as e:
        return False, f"command not found: {e}"
    except OSError as e:
        return False, str(e)

    if proc.returncode != 0:
        return False, proc.stderr.strip() or "non-zero exit"
    return True, proc.stdout


def validate_content(content: str) -> bool:
    """Return True when *content* is long enough and shows no HTML-leak marker.

    The stripped text must be at least ``MIN_CONTENT_LENGTH`` characters and
    its lower-cased form must contain none of ``HTML_LEAK_MARKERS``.
    """
    if not content or len(content.strip()) < MIN_CONTENT_LENGTH:
        return False
    lowered = content.lower()
    return not any(marker in lowered for marker in HTML_LEAK_MARKERS)


def _validated(ok: bool, out: str) -> tuple[bool, str]:
    """Turn a raw command result into a fetch result by validating the content."""
    if not ok:
        return False, out
    if validate_content(out):
        return True, out
    return False, "content validation failed"


def fetch_with_trafilatura(url: str) -> tuple[bool, str]:
    """Fetch *url* as markdown with the ``trafilatura`` CLI; returns (success, content-or-reason)."""
    ok, out = run_fetch_command(
        ["trafilatura", "-u", url, "--markdown", "--no-comments", "--precision"]
    )
    return _validated(ok, out)


def fetch_with_crawl4ai(url: str, stealth: bool = False) -> tuple[bool, str]:
    """Fetch *url* as markdown with the ``crwl`` CLI; returns (success, content-or-reason).

    With ``stealth=True`` the browser and crawler options for a randomized
    user agent, "magic" mode and full-page scanning are passed instead of the
    plain page timeout.
    """
    if stealth:
        extra = [
            "-b", "headless=true,user_agent_mode=random",
            "-c", "magic=true,scan_full_page=true,page_timeout=20000",
        ]
    else:
        extra = ["-c", "page_timeout=15000"]
    ok, out = run_fetch_command(["crwl", url, "-o", "markdown-fit", *extra], timeout=90)
    return _validated(ok, out)
def fetch_from_conversation(url: str, conversation_file: Path) -> tuple[bool, str]:
    """Fallback: take the text that follows the URL's first mention in the transcript.

    If the assistant fetched the URL during the session, some portion of the
    content is likely inline in the transcript. Returns (True, snippet) when
    the up-to-2000-character snippet starting at the URL passes
    ``validate_content``; otherwise (False, reason).
    """
    try:
        transcript = conversation_file.read_text(errors="replace")
    except OSError:
        return False, "cannot read conversation file"

    start = transcript.find(url)
    if start < 0:
        return False, "url not found in conversation"

    snippet = transcript[start : start + 2000]
    if validate_content(snippet):
        return True, snippet
    return False, "snippet failed validation"


def fetch_cascade(url: str, conversation_file: Path) -> tuple[bool, str, str]:
    """Attempt the full fetch cascade. Returns (success, content, method_used).

    Fetchers are tried in order — trafilatura, crawl4ai, crawl4ai in stealth
    mode, then the conversation transcript itself — and the first success
    wins. When all fail, the content slot carries the last fetcher's failure
    reason and the method is "failed".
    """
    attempts = (
        ("trafilatura", lambda: fetch_with_trafilatura(url)),
        ("crawl4ai", lambda: fetch_with_crawl4ai(url, stealth=False)),
        ("crawl4ai-stealth", lambda: fetch_with_crawl4ai(url, stealth=True)),
        ("conversation-fallback", lambda: fetch_from_conversation(url, conversation_file)),
    )
    out = ""
    for method, attempt in attempts:
        ok, out = attempt()
        if ok:
            return True, out, method
    return False, out, "failed"


# ---------------------------------------------------------------------------
# Raw file storage
# ---------------------------------------------------------------------------


def content_hash(content: str) -> str:
    """Return ``"sha256:<hex digest>"`` of the UTF-8 encoded *content*."""
    digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
    return f"sha256:{digest}"
def write_raw_file(
    url: str,
    content: str,
    method: str,
    discovered_in: Path,
) -> Path:
    """Store fetched *content* under ``RAW_HARVESTED_DIR`` and return the file path.

    The file name is derived from the URL; if that name is already taken, the
    first 8 hex characters of the URL's SHA-256 are appended to the stem. The
    file starts with a frontmatter block recording the source URL, fetch date
    (UTC), fetch method, the conversation it was discovered in (relative to
    ``WIKI_DIR``) and a hash of the content.
    """
    RAW_HARVESTED_DIR.mkdir(parents=True, exist_ok=True)

    target = RAW_HARVESTED_DIR / raw_filename_for_url(url)
    if target.exists():
        # Collision: append short hash
        url_tag = hashlib.sha256(url.encode()).hexdigest()[:8]
        target = RAW_HARVESTED_DIR / f"{target.stem}-{url_tag}.md"

    origin = discovered_in.relative_to(WIKI_DIR)
    fetched_on = datetime.now(timezone.utc).date().isoformat()
    header = (
        "---\n"
        f"source_url: {url}\n"
        f"fetched_date: {fetched_on}\n"
        f"fetch_method: {method}\n"
        f"discovered_in: {origin}\n"
        f"content_hash: {content_hash(content)}\n"
        "---\n"
    )
    target.write_text(header + content.strip() + "\n")
    return target
def call_claude_compile(
    raw_path: Path,
    raw_content: str,
    conversation_file: Path,
) -> dict[str, Any] | None:
    """Invoke `claude -p` to compile the raw source into a staging wiki page.

    Args:
        raw_path: Path of the stored raw file (currently not used here).
        raw_content: The fetched content; at most 40,000 characters are sent.
        conversation_file: Transcript that cited the URL; its first 8,000
            characters are sent as context.

    Returns:
        The JSON object emitted by the model, or ``None`` when the CLI is
        missing, cannot be started, times out, exits non-zero, or produces no
        parseable JSON. Every failure is reported on stderr.
    """

    # Pick model by size
    model = CLAUDE_SONNET_MODEL if len(raw_content) > SONNET_CONTENT_THRESHOLD else CLAUDE_HAIKU_MODEL

    try:
        wiki_index = INDEX_FILE.read_text()[:20_000]
    except OSError:
        wiki_index = ""

    try:
        conversation_context = conversation_file.read_text(errors="replace")[:8_000]
    except OSError:
        conversation_context = ""

    prompt = COMPILE_PROMPT_TEMPLATE.format(
        wiki_dir=str(WIKI_DIR),
        wiki_index=wiki_index,
        raw_content=raw_content[:40_000],
        conversation_context=conversation_context,
    )

    try:
        result = subprocess.run(
            ["claude", "-p", "--model", model, "--output-format", "text", prompt],
            capture_output=True,
            text=True,
            timeout=600,
        )
    except FileNotFoundError:
        print("  [warn] claude CLI not found — skipping compilation", file=sys.stderr)
        return None
    except subprocess.TimeoutExpired:
        print("  [warn] claude -p timed out", file=sys.stderr)
        return None
    except OSError as e:
        # The whole prompt travels as one argv entry, so the OS can refuse to
        # start the process (e.g. argument too long). Treat that like any
        # other compile failure instead of aborting the harvest run.
        print(f"  [warn] could not start claude -p: {e}", file=sys.stderr)
        return None

    if result.returncode != 0:
        print(f"  [warn] claude -p failed: {result.stderr.strip()[:200]}", file=sys.stderr)
        return None

    # Extract JSON from output (may be wrapped in fences)
    output = result.stdout.strip()
    match = re.search(r"\{.*\}", output, re.DOTALL)
    if not match:
        print(f"  [warn] no JSON found in claude output ({len(output)} chars)", file=sys.stderr)
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print(f"  [warn] JSON parse failed: {e}", file=sys.stderr)
        return None
STAGING_INJECT_TEMPLATE = (
    "---\n"
    "origin: automated\n"
    "status: pending\n"
    "staged_date: {staged_date}\n"
    "staged_by: wiki-harvest\n"
    "target_path: {target_path}\n"
    "{modifies_line}"
    "harvest_source: {source_url}\n"
    "compilation_notes: {compilation_notes}\n"
)

# Frontmatter keys owned by the harvest script; copies emitted by the AI are dropped.
_RESERVED_FIELD_RE = re.compile(
    r"^(status|origin|staged_\w+|target_path|modifies|harvest_source|compilation_notes):.*\n",
    re.MULTILINE,
)


def _inject_staging_frontmatter(
    content: str,
    source_url: str,
    target_path: str,
    compilation_notes: str,
    modifies: str | None,
) -> str:
    """Insert staging metadata after the opening --- fence of the AI-generated content.

    Reserved fields the AI may have added are removed from the frontmatter
    only. The page body is left untouched, so body lines that merely look
    like ``status: ...`` survive. Content without an opening fence gets a
    complete frontmatter block prepended.

    Args:
        content: Page markdown produced by the model.
        source_url: URL the page was compiled from.
        target_path: Wiki-relative path the page is destined for.
        compilation_notes: Free-text notes; collapsed to one line.
        modifies: Existing page being updated, or ``None`` for a new page.
    """
    modifies_line = f"modifies: {modifies}\n" if modifies else ""
    # Collapse multi-line compilation notes to single line for safe YAML
    clean_notes = compilation_notes.replace("\n", " ").replace("\r", " ").strip()
    injection = STAGING_INJECT_TEMPLATE.format(
        staged_date=datetime.now(timezone.utc).date().isoformat(),
        target_path=target_path,
        modifies_line=modifies_line,
        source_url=source_url,
        compilation_notes=clean_notes or "(none provided)",
    )

    if not content.startswith("---\n"):
        # AI forgot the fence — prepend full frontmatter; everything it wrote is body
        return injection + "---\n" + content

    rest = content[4:]
    closing = re.search(r"^---[ \t]*\r?$", rest, re.MULTILINE)
    if closing is None:
        # Unterminated frontmatter: there is no body to protect.
        return injection + _RESERVED_FIELD_RE.sub("", rest)

    frontmatter, remainder = rest[: closing.start()], rest[closing.start():]
    return injection + _RESERVED_FIELD_RE.sub("", frontmatter) + remainder


def _unique_staging_path(base: Path) -> Path:
    """Append a short hash if the target already exists."""
    if not base.exists():
        return base
    # Path plus current time, so repeated collisions get different suffixes.
    suffix = hashlib.sha256(str(base).encode() + str(time.time()).encode()).hexdigest()[:6]
    return base.with_stem(f"{base.stem}-{suffix}")
def _staging_destination(target_rel: str) -> Path | None:
    """Return ``STAGING_DIR / target_rel``, or ``None`` if that would leave the staging tree."""
    rel = Path(target_rel)
    if not rel.parts or rel.is_absolute() or ".." in rel.parts:
        return None
    return STAGING_DIR / rel


def apply_compile_result(
    result: dict[str, Any],
    source_url: str,
    raw_path: Path,
) -> list[Path]:
    """Write the AI compilation result into staging/. Returns paths written.

    The result is model output derived from arbitrary web content, so the
    paths it names are untrusted. Absolute paths and paths containing ``..``
    are refused (with a warning on stderr) rather than written, so a page can
    never land outside ``STAGING_DIR``.

    Args:
        result: Parsed JSON object from the compile step.
        source_url: URL the content was harvested from.
        raw_path: Path of the stored raw file (currently not used here).
    """
    written: list[Path] = []
    action = result.get("action", "skip")
    if action == "skip":
        return written

    notes = result.get("compilation_notes", "")

    # (target path, page content, page being modified) in write order
    planned: list[tuple[str, str, str | None]] = []

    # New page
    new_page = result.get("new_page")
    if not isinstance(new_page, dict):
        new_page = {}
    if action in ("new_page", "both") and new_page.get("filename") and new_page.get("content"):
        # An empty directory would make the target start with "/", i.e. absolute.
        directory = new_page.get("directory") or "patterns"
        planned.append((f"{directory}/{new_page['filename']}", new_page["content"], None))

    # Update to existing page
    update_page = result.get("update_page")
    if not isinstance(update_page, dict):
        update_page = {}
    if action in ("update_page", "both") and update_page.get("path") and update_page.get("content"):
        target_rel = str(update_page["path"])
        planned.append((target_rel, update_page["content"], target_rel))

    for target_rel, page_content, modifies in planned:
        base = _staging_destination(target_rel)
        if base is None:
            print(
                f"  [warn] refusing unsafe staging path from compile result: {target_rel!r}",
                file=sys.stderr,
            )
            continue
        dest = _unique_staging_path(base)
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(
            _inject_staging_frontmatter(
                page_content,
                source_url=source_url,
                target_path=target_rel,
                compilation_notes=notes,
                modifies=modifies,
            )
        )
        written.append(dest)

    return written
def wiki_covers_topic(url: str) -> bool:
    """Quick heuristic: ask ``qmd search`` whether the wiki mentions terms from the URL path.

    Used for C-type URLs (GitHub issues, SO questions) — only harvest if the
    wiki already covers the topic. Keywords are the path segments (split on
    ``/``, ``-`` and ``_``) of at least four characters; the first five form
    the query. There is no fallback search: if ``qmd`` is missing, times out,
    fails, or prints nothing parseable, the answer is False.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False

    keywords = [seg for seg in re.split(r"[/\-_]+", parsed.path.lower()) if len(seg) >= 4]
    if not keywords:
        return False

    try:
        proc = subprocess.run(
            ["qmd", "search", " ".join(keywords[:5]), "--json", "-n", "3"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return False

    if proc.returncode != 0 or not proc.stdout.strip():
        return False

    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return False

    # qmd may answer with either {"results": [...]} or a bare list.
    hits = payload.get("results") if isinstance(payload, dict) else payload
    return bool(hits)


# ---------------------------------------------------------------------------
# Conversation discovery
# ---------------------------------------------------------------------------


def parse_frontmatter(file_path: Path) -> dict[str, str]:
    """Read the leading ``---`` fenced block of a file as flat ``key: value`` pairs.

    Each line is split at its first colon; lines without a colon are ignored.
    Returns an empty dict when the file cannot be read, does not start with
    a ``---`` line, or has no closing ``---`` line.
    """
    try:
        text = file_path.read_text(errors="replace")
    except OSError:
        return {}

    if not text.startswith("---\n"):
        return {}
    end = text.find("\n---\n", 4)
    if end == -1:
        return {}

    fields: dict[str, str] = {}
    for line in text[4:end].splitlines():
        key, colon, value = line.partition(":")
        if colon:
            fields[key.strip()] = value.strip()
    return fields
def discover_summarized_conversations(
    project_filter: str | None = None,
    file_filter: str | None = None,
) -> list[Path]:
    """Return the conversation files to scan for URLs.

    With *file_filter*, the result is just that file (relative paths are
    resolved against ``WIKI_DIR``) if it exists; its status is not checked.
    Otherwise every ``*.md`` file whose frontmatter says
    ``status: summarized`` is returned, in sorted project and file order,
    optionally restricted to the project directory named *project_filter*.
    """
    if file_filter:
        candidate = Path(file_filter)
        if not candidate.is_absolute():
            candidate = WIKI_DIR / candidate
        return [candidate] if candidate.exists() else []

    return [
        md
        for project_dir in sorted(CONVERSATIONS_DIR.iterdir())
        if project_dir.is_dir()
        and not (project_filter and project_dir.name != project_filter)
        for md in sorted(project_dir.glob("*.md"))
        if parse_frontmatter(md).get("status") == "summarized"
    ]
def process_url(
    url: str,
    conversation_file: Path,
    state: dict[str, Any],
    dry_run: bool,
    compile_enabled: bool,
) -> str:
    """Process a single URL. Returns a short status tag for logging.

    Steps, in order: de-duplicate against the state buckets, classify,
    (unless ``dry_run``) fetch, store the raw file, and optionally compile
    it into staging pages. *state* is mutated in place; persisting it is the
    caller's job.

    Status tags returned:
    - ``dup-harvested`` / ``dup-rejected`` / ``dup-skipped`` / ``dup-failed``
      — already known; nothing fetched.
    - ``skip-domain`` / ``skip-c-type`` — classified as not worth harvesting.
    - ``would-harvest (...)`` — dry run only.
    - ``fetch-failed (...)`` — every fetcher failed; attempt counter bumped.
    - ``raw-saved (...)`` — raw file stored (optionally ``compile-failed``).
    - ``rejected (...)`` — the compile step decided to skip the content.
    - ``compiled (...) → N staging file(s)`` — staging pages written.
    """

    rel_conv = str(conversation_file.relative_to(WIKI_DIR))
    today = datetime.now(timezone.utc).date().isoformat()

    # Already harvested? Only record that this conversation cites it too.
    if url in state["harvested_urls"]:
        entry = state["harvested_urls"][url]
        if rel_conv not in entry.get("seen_in", []):
            entry.setdefault("seen_in", []).append(rel_conv)
        return "dup-harvested"

    # Already rejected by AI?
    if url in state["rejected_urls"]:
        return "dup-rejected"

    # Previously skipped?
    if url in state["skipped_urls"]:
        return "dup-skipped"

    # Previously failed too many times? Below the cap, fall through and retry.
    if url in state["failed_urls"]:
        if state["failed_urls"][url].get("attempts", 0) >= MAX_FAILED_ATTEMPTS:
            return "dup-failed"

    # Classify
    classification = classify_url(url)
    if classification == "skip":
        state["skipped_urls"][url] = {
            "reason": "domain-skip-list",
            "first_seen": today,
        }
        return "skip-domain"

    # C-type URLs are harvested only when the wiki already covers the topic.
    if classification == "check":
        if not wiki_covers_topic(url):
            state["skipped_urls"][url] = {
                "reason": "c-type-no-wiki-match",
                "first_seen": today,
            }
            return "skip-c-type"

    # Dry run stops here, after the classification bookkeeping above.
    if dry_run:
        return f"would-harvest ({classification})"

    # Fetch, then pause for FETCH_DELAY_SECONDS whether or not it succeeded
    print(f"  [fetch] {url}")
    ok, content, method = fetch_cascade(url, conversation_file)
    time.sleep(FETCH_DELAY_SECONDS)

    if not ok:
        # On failure `content` holds the failure reason, not page content.
        entry = state["failed_urls"].setdefault(url, {
            "first_seen": today,
            "attempts": 0,
        })
        entry["attempts"] += 1
        entry["last_attempt"] = today
        entry["reason"] = content[:200] if content else "unknown"
        return f"fetch-failed ({method})"

    # Save raw file
    raw_path = write_raw_file(url, content, method, conversation_file)
    rel_raw = str(raw_path.relative_to(WIKI_DIR))

    state["harvested_urls"][url] = {
        "first_seen": today,
        "seen_in": [rel_conv],
        "raw_file": rel_raw,
        "wiki_pages": [],
        "status": "raw",
        "fetch_method": method,
        "last_checked": today,
    }

    # Compile via claude -p
    if compile_enabled:
        print(f"  [compile] {rel_raw}")
        result = call_claude_compile(raw_path, content, conversation_file)
        if result is None:
            # The URL stays in harvested_urls, so later runs see it as a duplicate.
            state["harvested_urls"][url]["status"] = "raw-compile-failed"
            return f"raw-saved ({method}) compile-failed"

        action = result.get("action", "skip")
        if action == "skip":
            state["rejected_urls"][url] = {
                "reason": result.get("compilation_notes", "AI rejected"),
                "rejected_date": today,
            }
            # Remove from harvested; keep raw file for audit
            state["harvested_urls"].pop(url, None)
            return f"rejected ({method})"

        written = apply_compile_result(result, url, raw_path)
        state["harvested_urls"][url]["status"] = "compiled"
        state["harvested_urls"][url]["wiki_pages"] = [
            str(p.relative_to(WIKI_DIR)) for p in written
        ]
        return f"compiled ({method}) → {len(written)} staging file(s)"

    return f"raw-saved ({method})"
+ processed_new = 0 + + for file_path in files: + urls = extract_urls_from_file(file_path) + if not urls: + continue + rel = file_path.relative_to(WIKI_DIR) + print(f"\n[{rel}] {len(urls)} URL(s)") + + for url in urls: + status = process_url( + url, + file_path, + state, + dry_run=args.dry_run, + compile_enabled=not args.no_compile, + ) + stats[status] = stats.get(status, 0) + 1 + print(f" [{status}] {url}") + + # Persist state after each non-dry URL + if not args.dry_run and not status.startswith("dup-"): + processed_new += 1 + save_state(state) + + if args.limit and processed_new >= args.limit: + print(f"\nLimit reached ({args.limit}); stopping.") + save_state(state) + _print_summary(stats) + return 0 + + if not args.dry_run: + save_state(state) + + _print_summary(stats) + return 0 + + +def _print_summary(stats: dict[str, int]) -> None: + print("\nSummary:") + for status, count in sorted(stats.items()): + print(f" {status}: {count}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/wiki-hygiene.py b/scripts/wiki-hygiene.py new file mode 100755 index 0000000..ed20b1b --- /dev/null +++ b/scripts/wiki-hygiene.py @@ -0,0 +1,1587 @@ +#!/usr/bin/env python3 +"""Automated wiki hygiene — quick (no LLM) and full (LLM) modes. + +Implements Plan 02 (staleness & archive) and Plan 04 (automated hygiene). 
+ +Quick mode checks (daily, no LLM): + - Backfill missing last_verified + - Refresh last_verified from conversation references + - Auto-restore archived pages referenced again + - Confidence decay per thresholds + - Archive stale and superseded pages + - Frontmatter repair (missing required fields) + - Orphan pages (no inbound links) + - Broken cross-references (with fuzzy-match fix) + - Main index drift (missing/orphan entries) + - Empty stubs (report-only) + - State file drift (report-only) + - Staging/archive index resync + +Full mode checks (weekly, LLM-powered, extends quick): + - Missing cross-references (haiku) + - Duplicate coverage (sonnet) + - Contradictions (sonnet, report-only) + - Technology lifecycle (haiku) + +Usage: + python3 scripts/wiki-hygiene.py # Quick mode (default) + python3 scripts/wiki-hygiene.py --quick # Explicit quick + python3 scripts/wiki-hygiene.py --full # Full mode (quick + LLM) + python3 scripts/wiki-hygiene.py --dry-run # Show what would change + python3 scripts/wiki-hygiene.py --check-only # Report only, no auto-fixes + python3 scripts/wiki-hygiene.py --backfill # Backfill last_verified only + python3 scripts/wiki-hygiene.py --scan-refs # Refresh from conversation refs only + python3 scripts/wiki-hygiene.py --archive PATH # Manually archive a page + python3 scripts/wiki-hygiene.py --restore PATH # Manually restore an archived page +""" + +from __future__ import annotations + +import argparse +import difflib +import json +import re +import subprocess +import sys +from dataclasses import dataclass, field +from datetime import date, datetime, timezone +from pathlib import Path +from typing import Any + +sys.path.insert(0, str(Path(__file__).parent)) +from wiki_lib import ( # noqa: E402 + ARCHIVE_DIR, + ARCHIVE_INDEX, + CONVERSATIONS_DIR, + HARVEST_STATE_FILE, + INDEX_FILE, + LIVE_CONTENT_DIRS, + REPORTS_DIR, + STAGING_DIR, + STAGING_INDEX, + WIKI_DIR, + WikiPage, + iter_archived_pages, + iter_live_pages, + iter_staging_pages, + 
def load_hygiene_state() -> dict[str, Any]:
    """Load the persisted hygiene state, falling back to a fresh default.

    Returns:
        The parsed contents of ``HYGIENE_STATE_FILE`` when that file exists and
        holds a JSON object. A missing, unreadable, or malformed file yields a
        new default state instead, as does valid JSON whose top level is not
        an object, so callers can always treat the result as a dict.
    """
    if HYGIENE_STATE_FILE.exists():
        loaded: Any = None
        try:
            with open(HYGIENE_STATE_FILE) as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # a corrupt state file is treated as "no state yet".
            loaded = None
        # Callers use .get()/.setdefault() on the result, so a top-level
        # list/null/number must not be handed back.
        if isinstance(loaded, dict):
            return loaded
    return {
        "last_quick_run": None,
        "last_full_run": None,
        "pages_checked": {},
        "deferred_issues": [],
    }
def is_deferred(state: dict[str, Any], issue_type: str, pages: list[str]) -> bool:
    """Report whether an issue has been recorded as deferred.

    Args:
        state: Hygiene state; its ``deferred_issues`` list is consulted.
        issue_type: Value to compare against each deferred issue's ``type``.
        pages: Page paths involved in the issue; order does not matter.

    Returns:
        True when some deferred issue has the same type and the same pages.
    """
    wanted = sorted(pages)
    return any(
        deferred.get("type") == issue_type
        and sorted(deferred.get("pages", [])) == wanted
        for deferred in state.get("deferred_issues", [])
    )
list[tuple[Path, str, date]]: + changes: list[tuple[Path, str, date]] = [] + for page in iter_live_pages(): + if "last_verified" in page.frontmatter and parse_date(page.frontmatter["last_verified"]): + continue + + source = "mtime" + d = parse_date(page.frontmatter.get("last_compiled")) + if d: + source = "last_compiled" + else: + d = git_first_commit_date(page.path) + if d: + source = "git" + else: + d = file_mtime_date(page.path) + + changes.append((page.path, source, d)) + if not dry_run: + page.frontmatter["last_verified"] = d.isoformat() + write_page(page) + return changes + + +# --------------------------------------------------------------------------- +# Frontmatter repair +# --------------------------------------------------------------------------- + + +def repair_frontmatter(dry_run: bool = False) -> list[tuple[Path, list[str]]]: + """Add missing required fields with sensible defaults. Returns list of (page, fields_fixed).""" + changes: list[tuple[Path, list[str]]] = [] + for page in iter_live_pages(): + fixes: list[str] = [] + fm = page.frontmatter + + if "title" not in fm: + fm["title"] = page.path.stem.replace("-", " ").title() + fixes.append("title") + + if "type" not in fm or fm["type"] not in VALID_TYPES: + inferred = page.path.parent.name.rstrip("s") + if inferred in VALID_TYPES: + fm["type"] = inferred + fixes.append("type") + + if "confidence" not in fm or str(fm.get("confidence")) not in VALID_CONFIDENCE: + fm["confidence"] = "medium" + fixes.append("confidence") + + if "last_compiled" not in fm or not parse_date(fm.get("last_compiled")): + d = git_first_commit_date(page.path) or file_mtime_date(page.path) + fm["last_compiled"] = d.isoformat() + fixes.append("last_compiled") + + if "last_verified" not in fm or not parse_date(fm.get("last_verified")): + fm["last_verified"] = fm.get("last_compiled") or today().isoformat() + fixes.append("last_verified") + + if "sources" not in fm: + fm["sources"] = [] + fixes.append("sources") + + if "related" 
def _min_confidence(a: str, b: str) -> str:
    """Return whichever of two confidence labels ranks lower in CONFIDENCE_ORDER.

    A label that is not in CONFIDENCE_ORDER ranks above every known label, so
    the recognised one wins. When neither label is recognised, ``a`` is
    returned unchanged instead of indexing past the end of CONFIDENCE_ORDER.
    """
    unknown = len(CONFIDENCE_ORDER)
    rank = {label: position for position, label in enumerate(CONFIDENCE_ORDER)}
    lowest = min(rank.get(a, unknown), rank.get(b, unknown))
    if lowest == unknown:
        # Both labels are unrecognised; there is nothing to compare.
        return a
    return CONFIDENCE_ORDER[lowest]
def _remove_from_main_index(original_path: str) -> None:
    """Delete the bullet entry that links to ``original_path`` from the main index.

    An entry is a line of the form ``- [Title](path)``, optionally followed by
    `` — summary``. The index file is rewritten only when a line was removed;
    nothing happens if the index file does not exist.

    Args:
        original_path: Wiki-relative path of the page whose entry should go.
    """
    if not INDEX_FILE.exists():
        return
    lines = INDEX_FILE.read_text().splitlines(keepends=True)
    # Two details matter here:
    #   * the title may not run across a "](" boundary, otherwise an entry whose
    #     summary merely links to this page would be deleted as well;
    #   * the link may be followed by whitespace OR the end of the line, because
    #     entries written without a summary have nothing after the ")".
    entry = re.compile(
        rf"^- \[(?:(?!\]\().)+\]\({re.escape(original_path)}\)(?:\s|$)"
    )
    kept = [line for line in lines if not entry.match(line)]
    if len(kept) != len(lines):
        INDEX_FILE.write_text("".join(kept))
def _add_to_main_index(rel_path: str, title: str, summary: str = "") -> None:
    """Add a bullet entry for ``rel_path`` to the main index.

    The entry is appended after the last line of the section matching the
    path's first component (``patterns`` → ``## Patterns`` and so on). When the
    index has no such section the entry is appended to the end of the file.
    Nothing is written if the index file is missing or already links to
    ``rel_path``.

    Args:
        rel_path: Wiki-relative path of the page.
        title: Link text for the entry.
        summary: Optional text placed after an em dash.
    """
    if not INDEX_FILE.exists():
        return
    text = INDEX_FILE.read_text()
    if f"]({rel_path})" in text:
        return

    entry = f"- [{title}]({rel_path})"
    if summary:
        entry += f" — {summary}"

    section_headers = {
        "patterns": "## Patterns",
        "decisions": "## Decisions",
        "concepts": "## Concepts",
        "environments": "## Environments",
    }
    header = section_headers.get(rel_path.split("/")[0])
    heading = None
    if header:
        # Anchor at the start of a line so "## Patterns" is not found inside
        # "### Patterns"; anything may follow the word on the same line.
        heading = re.search(rf"^{re.escape(header)}\b[^\n]*$", text, re.MULTILINE)
    if heading is None:
        INDEX_FILE.write_text(text.rstrip() + "\n" + entry + "\n")
        return

    next_header = text.find("\n## ", heading.end())
    section_end = len(text) if next_header == -1 else next_header
    # Insert right after the section's last non-blank character. Taking the
    # start of the last line instead would put the entry before the final
    # existing entry, or before the header itself when the section is empty.
    content_end = heading.start() + len(text[heading.start():section_end].rstrip())
    separator = "\n\n" if content_end <= heading.end() else "\n"
    tail = text[content_end:] or "\n"
    INDEX_FILE.write_text(text[:content_end] + separator + entry + tail)
def _rewrite_cross_references(old_path: str, new_path: str) -> int:
    """Point every reference to ``old_path`` at ``new_path`` instead.

    Scans the main index plus the ``*.md`` files directly inside each
    LIVE_CONTENT_DIRS sub-directory of the wiki, the staging area, and the
    archive. Three forms are rewritten: ``](old)``, ``](../old)``, and a bare
    ``- old`` list line.

    Args:
        old_path: Path currently used in links.
        new_path: Path that should replace it; inserted literally.

    Returns:
        The number of files that were modified.
    """
    targets: list[Path] = [INDEX_FILE]
    roots = [WIKI_DIR]
    roots.extend(root for root in (STAGING_DIR, ARCHIVE_DIR) if root.exists())
    for root in roots:
        for sub in LIVE_CONTENT_DIRS:
            targets.extend((root / sub).glob("*.md"))

    old_esc = re.escape(old_path)
    # Replacements are callables, not template strings: a template would
    # interpret backslashes in new_path as escapes or group references.
    rewrites = [
        (re.compile(rf"\]\({old_esc}\)"), lambda m: f"]({new_path})"),
        (re.compile(rf"\]\(\.\./{old_esc}\)"), lambda m: f"](../{new_path})"),
        (
            re.compile(rf"^(\s*-\s*){old_esc}$", re.MULTILINE),
            lambda m: f"{m.group(1)}{new_path}",
        ),
    ]

    count = 0
    for target in targets:
        if not target.exists():
            continue
        try:
            text = target.read_text()
        except OSError:
            continue
        new_text = text
        for pattern, replacement in rewrites:
            new_text = pattern.sub(replacement, new_text)
        if new_text != text:
            target.write_text(new_text)
            count += 1
    return count
def apply_refresh_signals(refs: dict[str, date], dry_run: bool = False) -> list[tuple[Path, str, str, date]]:
    """Refresh ``last_verified`` on live pages that conversations referenced more recently.

    A live page is refreshed when its wiki-relative path is a key of ``refs``
    and it either has no parseable ``last_verified`` or that date is older than
    the reference date. ``low`` and ``medium`` confidence are raised one step
    via ``bump_confidence``; any other value is kept.

    Args:
        refs: Map of wiki-relative page path to a reference date.
        dry_run: When true, report the changes without writing any page.

    Returns:
        One ``(path, old_confidence, new_confidence, reference_date)`` tuple
        per refreshed page.
    """
    bumpable = ("low", "medium")
    refreshed: list[tuple[Path, str, str, date]] = []
    for page in iter_live_pages():
        seen_on = refs.get(str(page.path.relative_to(WIKI_DIR)))
        if not seen_on:
            continue

        verified = parse_date(page.frontmatter.get("last_verified"))
        if verified and verified >= seen_on:
            continue  # already verified on or after the reference date

        before = str(page.frontmatter.get("confidence", "medium"))
        after = bump_confidence(before) if before in bumpable else before
        refreshed.append((page.path, before, after, seen_on))

        if dry_run:
            continue
        page.frontmatter["last_verified"] = seen_on.isoformat()
        if after != before:
            page.frontmatter["confidence"] = after
        write_page(page)
    return refreshed
def find_orphan_pages() -> list[WikiPage]:
    """Return live pages that nothing else links to.

    A page counts as linked when its wiki-relative path occurs as a substring
    of index.md or of any other live page's file text. Occurrences inside the
    page's own file are ignored.
    """
    # Materialise once: the pages are walked twice below, and a one-shot
    # iterator would silently yield nothing on the second pass.
    pages = list(iter_live_pages())
    # Read each file a single time and reuse the text for both the combined
    # corpus and the per-page self-reference count.
    texts = [page.path.read_text() for page in pages]

    corpus = [INDEX_FILE.read_text()] if INDEX_FILE.exists() else []
    corpus.extend(texts)
    combined = "\n".join(corpus)

    orphans: list[WikiPage] = []
    for page, own_text in zip(pages, texts):
        rel = str(page.path.relative_to(WIKI_DIR))
        # Every occurrence being inside the page itself means no inbound link.
        if combined.count(rel) == own_text.count(rel):
            orphans.append(page)
    return orphans
def find_broken_cross_refs() -> list[tuple[Path, str, str | None]]:
    """Return list of (page_path, bad_link, suggested_fix_or_None).

    Scans index.md and every ``*.md`` file directly inside the live content
    directories for markdown links matched by LINK_REGEX and bare list entries
    matched by RELATED_LINE_REGEX. Each distinct link per file is classified:

    - a link to an existing live page is fine and skipped;
    - an ``archive/<path>`` link whose ``<path>`` exists in the archive is also
      fine (``archive_page`` rewrites links into that form) and skipped;
    - a live-style link whose path exists in the archive is reported with the
      suggestion ``__RESTORE__:<link>``;
    - anything else is reported with the closest live path from
      ``fuzzy_find_page``, or None when there is no close match.

    `archived_paths` holds paths relative to ARCHIVE_DIR, not WIKI_DIR, so a
    broken live link can be checked directly against the archive.
    """
    archive_prefix = "archive/"
    live_names = {str(p.path.relative_to(WIKI_DIR)) for p in iter_live_pages()}
    archived_paths = {str(p.path.relative_to(ARCHIVE_DIR)) for p in iter_archived_pages()}

    scan: list[Path] = [INDEX_FILE]
    for sub in LIVE_CONTENT_DIRS:
        scan.extend((WIKI_DIR / sub).glob("*.md"))

    results: list[tuple[Path, str, str | None]] = []
    for target in scan:
        try:
            text = target.read_text()
        except OSError:
            continue
        # Markdown links first, then bare `related:` entries, as one stream so
        # each link is reported at most once per file.
        links = LINK_REGEX.findall(text)
        links.extend(m.group(1) for m in RELATED_LINE_REGEX.finditer(text))
        seen: set[str] = set()
        for link in links:
            if link in seen:
                continue
            seen.add(link)
            if link in live_names:
                continue
            if link.startswith(archive_prefix) and link[len(archive_prefix):] in archived_paths:
                # A deliberate pointer at an archived page, not a broken link.
                continue
            if link in archived_paths:
                # Reference to archive → trigger restore
                results.append((target, link, f"__RESTORE__:{link}"))
                continue
            results.append((target, link, fuzzy_find_page(link, live_names)))
    return results
def find_index_drift() -> tuple[list[str], list[str]]:
    """Compare the live pages on disk with the links in the main index.

    Returns:
        ``(missing_from_index, stale_index_entries)``, both sorted. The first
        lists live pages the index does not link to. The second lists index
        links with no live page behind them, except links starting with
        ``archive/``, which are never reported as stale.
    """
    on_disk = {str(page.path.relative_to(WIKI_DIR)) for page in iter_live_pages()}
    in_index: set[str] = (
        set(LINK_REGEX.findall(INDEX_FILE.read_text())) if INDEX_FILE.exists() else set()
    )
    missing = sorted(on_disk.difference(in_index))
    stale = sorted(
        link
        for link in in_index
        if link not in on_disk and not link.startswith("archive/")
    )
    return missing, stale
def find_state_drift() -> list[str]:
    """Report state-file entries that point at files which no longer exist.

    Checks three state files, each only if present:

    - ``.mine-state.json``: every session's ``output_file``;
    - ``.harvest-state.json``: every harvested URL's ``raw_file`` and each of
      its ``wiki_pages``;
    - ``.hygiene-state.json``: every ``pages_checked`` key, looked up under both
      the wiki root and the archive.

    A state file that cannot be read or parsed produces one issue line of its
    own. Nothing is modified.

    Returns:
        Human-readable issue descriptions; empty when no drift was found.
    """
    issues: list[str] = []

    # .mine-state.json → output_file existence
    if MINE_STATE_FILE.exists():
        try:
            # `with` closes the handle; json.load(open(...)) left it open.
            with open(MINE_STATE_FILE) as f:
                mine = json.load(f)
            for sid, info in mine.get("sessions", {}).items():
                out = info.get("output_file")
                if out:
                    out_path = WIKI_DIR / out
                    if not out_path.exists():
                        issues.append(f"mine: session {sid[:8]} references missing {out}")
        except (OSError, json.JSONDecodeError) as e:
            issues.append(f"mine: could not parse .mine-state.json ({e})")

    # .harvest-state.json → raw_file / wiki_pages existence
    if HARVEST_STATE_FILE.exists():
        try:
            with open(HARVEST_STATE_FILE) as f:
                harvest = json.load(f)
            for url, info in harvest.get("harvested_urls", {}).items():
                raw = info.get("raw_file")
                if raw and not (WIKI_DIR / raw).exists():
                    issues.append(f"harvest: {url[:60]} → missing raw file {raw}")
                for wiki_page in info.get("wiki_pages", []):
                    if wiki_page and not (WIKI_DIR / wiki_page).exists():
                        issues.append(f"harvest: {url[:60]} → missing wiki page {wiki_page}")
        except (OSError, json.JSONDecodeError) as e:
            issues.append(f"harvest: could not parse .harvest-state.json ({e})")

    # .hygiene-state.json → pages_checked existence
    if HYGIENE_STATE_FILE.exists():
        try:
            with open(HYGIENE_STATE_FILE) as f:
                h = json.load(f)
            for rel in h.get("pages_checked", {}):
                if not (WIKI_DIR / rel).exists() and not (ARCHIVE_DIR / rel).exists():
                    issues.append(f"hygiene: pages_checked references missing {rel}")
        except (OSError, json.JSONDecodeError) as e:
            issues.append(f"hygiene: could not parse .hygiene-state.json ({e})")

    return issues
def _build_staging_index(pending: list[WikiPage]) -> str:
    """Render the complete text of staging/index.md for the given pending pages.

    The output is a fixed preamble including the item count and today's date,
    followed by a table with one row per page, or a placeholder line when
    ``pending`` is empty. The returned text ends with a single newline.
    """
    preamble = [
        "# Staging — Pending Wiki Content",
        "",
        "Content awaiting human review. These pages were generated by automated scripts",
        "and need approval before joining the live wiki.",
        "",
        "**Review options**:",
        "- Browse in Obsidian and move files manually (then run `scripts/wiki-staging.py --sync`)",
        "- Run `python3 scripts/wiki-staging.py --list` for a summary",
        '- Start a Claude session: "let\'s review what\'s in staging"',
        "",
        f"**{len(pending)} pending item(s)** as of {today().isoformat()}",
        "",
        "## Pending Items",
        "",
    ]

    def table_row(page: WikiPage) -> str:
        # Missing frontmatter fields fall back to a placeholder or to the
        # page's own staging-relative path.
        meta = page.frontmatter
        rel = str(page.path.relative_to(STAGING_DIR))
        title = meta.get("title", page.path.stem)
        ptype = meta.get("type", "unknown")
        staged_by = meta.get("staged_by", "unknown")
        staged = meta.get("staged_date", "—")
        target = meta.get("target_path", rel)
        return f"| [{title}]({rel}) | {ptype} | {staged_by} | {staged} | `{target}` |"

    if pending:
        table = [
            "| Page | Type | Source | Staged | Target |",
            "|------|------|--------|--------|--------|",
        ]
        table.extend(table_row(page) for page in pending)
    else:
        table = ["_No pending items._"]
    return "\n".join(preamble + table) + "\n"
def _build_archive_index(archived: list[WikiPage]) -> str:
    """Render the complete text of archive/index.md for the given archived pages.

    The output is a fixed header and table heading followed by one row per
    page, or a single placeholder row when ``archived`` is empty. The returned
    text ends with a single newline.
    """
    header = [
        "# Archived Wiki Pages",
        "",
        "Pages archived due to staleness or obsolescence. Excluded from default",
        'wiki searches but available via `qmd search "topic" -c wiki-archive`.',
        "",
        "## Archived Pages",
        "",
        "| Page | Original Location | Archived | Reason |",
        "|------|-------------------|----------|--------|",
    ]

    def table_row(page: WikiPage) -> str:
        # The display name comes from the file name, not the frontmatter title.
        meta = page.frontmatter
        name = page.path.stem.replace("-", " ").title()
        rel = str(page.path.relative_to(ARCHIVE_DIR))
        original = meta.get("original_path", rel)
        archived_date = meta.get("archived_date", "—")
        reason = meta.get("archived_reason", "—")
        return f"| [{name}]({rel}) | {original} | {archived_date} | {reason} |"

    if archived:
        rows = [table_row(page) for page in archived]
    else:
        rows = ["| _(none yet)_ | | | |"]
    return "\n".join(header + rows) + "\n"
{result.stderr.strip()[:200]}", file=sys.stderr) + return None + return result.stdout.strip() + + +def _extract_json(text: str) -> Any: + match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL) + if not match: + return None + try: + return json.loads(match.group(0)) + except json.JSONDecodeError: + return None + + +def _page_header_snippet(page: WikiPage) -> str: + """Short representation of a page for LLM prompts: rel path + title + first paragraph.""" + rel = str(page.path.relative_to(WIKI_DIR)) + title = str(page.frontmatter.get("title", page.path.stem)) + first_para = "" + for line in page.body.strip().splitlines(): + line = line.strip() + if line and not line.startswith("#"): + first_para = line[:400] + break + return f"`{rel}` — {title}\n{first_para}" + + +def find_missing_cross_refs_llm(pages: list[WikiPage]) -> list[tuple[Path, list[str]]]: + """For each page, ask haiku which other pages it should link to. + + Returns list of (page_path, suggested_rel_paths). + """ + if not pages: + return [] + # Use index.md as the catalog of candidates + try: + index_text = INDEX_FILE.read_text()[:10_000] + except OSError: + return [] + + results: list[tuple[Path, list[str]]] = [] + # Batch 5 pages per call + for i in range(0, len(pages), 5): + batch = pages[i : i + 5] + batch_text = "\n\n".join( + f"### PAGE {n+1}: {str(p.path.relative_to(WIKI_DIR))}\n" + f"title: {p.frontmatter.get('title', p.path.stem)}\n" + f"current related: {p.frontmatter.get('related', [])}\n" + f"first paragraph:\n{_page_header_snippet(p)}" + for n, p in enumerate(batch) + ) + prompt = ( + "You are reviewing wiki pages for missing cross-references. For each PAGE below, " + "identify OTHER wiki pages it should link to but currently doesn't. Only suggest " + "pages listed in the INDEX. Be conservative — only suggest strong topical matches.\n\n" + "Emit a single JSON object mapping the page's relative path to an array of relative " + "paths it should link to. Omit pages with no suggestions. 
No prose.\n\n" + f"### INDEX\n{index_text}\n\n" + f"### PAGES TO REVIEW\n{batch_text}\n" + ) + raw = call_claude(prompt, model=CLAUDE_HAIKU) + if not raw: + continue + data = _extract_json(raw) + if not isinstance(data, dict): + continue + for p in batch: + rel = str(p.path.relative_to(WIKI_DIR)) + suggestions = data.get(rel) + if isinstance(suggestions, list) and suggestions: + # Filter out pages already in related + existing = set(str(x) for x in (p.frontmatter.get("related") or [])) + new = [s for s in suggestions if s not in existing and s != rel] + if new: + results.append((p.path, new)) + return results + + +def find_duplicates_llm(pages: list[WikiPage]) -> list[tuple[Path, Path, str]]: + """First pass (no LLM) groups by keyword overlap; second pass (sonnet) confirms duplicates. + + Returns list of (weaker_path, stronger_path, reason). + """ + if len(pages) < 2: + return [] + + # Group pages by type + by_type: dict[str, list[WikiPage]] = {} + for p in pages: + t = str(p.frontmatter.get("type", "")) + by_type.setdefault(t, []).append(p) + + candidates: list[tuple[WikiPage, WikiPage]] = [] + for type_pages in by_type.values(): + for i, a in enumerate(type_pages): + a_words = _title_keywords(a) + for b in type_pages[i + 1 :]: + overlap = a_words & _title_keywords(b) + if len(overlap) >= 2: + candidates.append((a, b)) + + results: list[tuple[Path, Path, str]] = [] + for a, b in candidates[:10]: # cap to control LLM cost + prompt = ( + "Are these two wiki pages duplicates (substantially the same topic)?\n\n" + f"### PAGE A: {a.path.relative_to(WIKI_DIR)}\n{a.body[:3000]}\n\n" + f"### PAGE B: {b.path.relative_to(WIKI_DIR)}\n{b.body[:3000]}\n\n" + "Emit a single JSON object: " + '{\"duplicate\": true|false, \"stronger\": \"A\"|\"B\", \"reason\": \"...\"}. ' + "No prose." 
+ ) + raw = call_claude(prompt, model=CLAUDE_SONNET) + data = _extract_json(raw or "") + if isinstance(data, dict) and data.get("duplicate"): + stronger = data.get("stronger", "A") + reason = str(data.get("reason", "")) + if stronger == "A": + results.append((b.path, a.path, reason)) + else: + results.append((a.path, b.path, reason)) + return results + + +def _title_keywords(page: WikiPage) -> set[str]: + title = str(page.frontmatter.get("title", page.path.stem)).lower() + return {w for w in re.split(r"[^a-z0-9]+", title) if len(w) > 3} + + +def find_contradictions_llm(pages: list[WikiPage]) -> list[tuple[Path, Path, str]]: + """Report-only — pair up related pages and ask sonnet to find conflicting claims.""" + # Focus on decisions/ and patterns/ + focus = [p for p in pages if str(p.frontmatter.get("type")) in ("decision", "pattern")] + if len(focus) < 2: + return [] + + # Build candidate pairs from shared related: links + by_path = {str(p.path.relative_to(WIKI_DIR)): p for p in focus} + candidates: list[tuple[WikiPage, WikiPage]] = [] + seen_pairs: set[tuple[str, str]] = set() + for p in focus: + related = p.frontmatter.get("related") or [] + if not isinstance(related, list): + continue + for rel_link in related: + other = by_path.get(str(rel_link)) + if not other: + continue + key = tuple(sorted([str(p.path), str(other.path)])) + if key in seen_pairs: + continue + seen_pairs.add(key) + candidates.append((p, other)) + + results: list[tuple[Path, Path, str]] = [] + for a, b in candidates[:8]: # cap + prompt = ( + "Compare these two wiki pages for contradictions in their claims or recommendations. " + "Only flag genuine contradictions, not complementary content.\n\n" + f"### PAGE A: {a.path.relative_to(WIKI_DIR)}\n{a.body[:3000]}\n\n" + f"### PAGE B: {b.path.relative_to(WIKI_DIR)}\n{b.body[:3000]}\n\n" + "Emit a single JSON object: " + '{\"contradiction\": true|false, \"description\": \"...\"}. No prose.' 
+ ) + raw = call_claude(prompt, model=CLAUDE_SONNET) + data = _extract_json(raw or "") + if isinstance(data, dict) and data.get("contradiction"): + results.append((a.path, b.path, str(data.get("description", "")))) + return results + + +def find_tech_lifecycle_issues() -> list[tuple[Path, str]]: + """Flag pages mentioning outdated versions when newer ones appear in recent conversations.""" + page_versions: dict[Path, dict[str, str]] = {} + for page in iter_live_pages(): + versions = {} + for m in VERSION_REGEX.finditer(page.body): + tool = m.group(0).split()[0].lower() + versions[tool] = m.group(1) + if versions: + page_versions[page.path] = versions + + if not CONVERSATIONS_DIR.exists(): + return [] + + # Scan recent conversations (last 90 days) + recent_versions: dict[str, str] = {} + cutoff = today() - __import__("datetime").timedelta(days=90) + for project_dir in CONVERSATIONS_DIR.iterdir(): + if not project_dir.is_dir(): + continue + for md in project_dir.glob("*.md"): + page = parse_page(md) + if not page: + continue + d = parse_date(page.frontmatter.get("date")) + if not d or d < cutoff: + continue + for m in VERSION_REGEX.finditer(page.body): + tool = m.group(0).split()[0].lower() + ver = m.group(1) + if tool not in recent_versions or _version_gt(ver, recent_versions[tool]): + recent_versions[tool] = ver + + results: list[tuple[Path, str]] = [] + for path, versions in page_versions.items(): + for tool, page_ver in versions.items(): + recent = recent_versions.get(tool) + if recent and _version_gt(recent, page_ver): + results.append((path, f"{tool} {page_ver} in page; {recent} in recent conversations")) + break # one flag per page is enough + return results + + +def _version_gt(a: str, b: str) -> bool: + try: + ap = [int(x) for x in a.split(".")] + bp = [int(x) for x in b.split(".")] + return ap > bp + except ValueError: + return False + + +# --------------------------------------------------------------------------- +# Reports +# 
@dataclass
class HygieneReport:
    """Accumulated results of one hygiene run.

    Filled in by run_quick_hygiene / run_full_hygiene and consumed by the
    report writers. Every list defaults to empty and both flags to False.
    """

    # Quick-mode fields
    # (page, source, date) per page whose last_verified was backfilled
    backfilled: list[tuple[Path, str, date]] = field(default_factory=list)
    # (page, old confidence, new confidence, reference date)
    refreshed: list[tuple[Path, str, str, date]] = field(default_factory=list)
    # (page, old confidence, new confidence)
    decayed: list[tuple[Path, str, str]] = field(default_factory=list)
    # (page, reason)
    archived: list[tuple[Path, str]] = field(default_factory=list)
    restored: list[Path] = field(default_factory=list)
    # (page, names of the frontmatter fields that were added)
    frontmatter_fixes: list[tuple[Path, list[str]]] = field(default_factory=list)
    orphans_fixed: list[Path] = field(default_factory=list)
    orphans_unfixed: list[Path] = field(default_factory=list)
    # (page containing the link, broken target, replacement target)
    xrefs_fixed: list[tuple[Path, str, str]] = field(default_factory=list)
    # (page containing the link, broken target) — no replacement was found
    xrefs_unfixed: list[tuple[Path, str]] = field(default_factory=list)
    # Entries reported by find_index_drift() as missing from / stale in the index
    index_drift_added: list[str] = field(default_factory=list)
    index_drift_removed: list[str] = field(default_factory=list)
    # Return values of sync_staging_index() / sync_archive_index()
    staging_synced: bool = False
    archive_synced: bool = False
    # Report-only
    empty_stubs: list[Path] = field(default_factory=list)
    state_drift: list[str] = field(default_factory=list)
    # Full-mode fields
    # (page, suggested related paths)
    missing_xrefs: list[tuple[Path, list[str]]] = field(default_factory=list)
    # (weaker page, stronger page, reason)
    duplicates: list[tuple[Path, Path, str]] = field(default_factory=list)
    # (page A, page B, description)
    contradictions: list[tuple[Path, Path, str]] = field(default_factory=list)
    # (page, note)
    tech_lifecycle: list[tuple[Path, str]] = field(default_factory=list)
scan_conversation_references() + report.refreshed = apply_refresh_signals(refs, dry_run=not apply) + + print("[quick] auto-restoring archived pages referenced again") + report.restored = auto_restore_archived(dry_run=not apply) + + print("[quick] repairing frontmatter") + report.frontmatter_fixes = repair_frontmatter(dry_run=not apply) + + print("[quick] applying confidence decay") + for page in iter_live_pages(): + current = str(page.frontmatter.get("confidence", "medium")) + last_verified = parse_date(page.frontmatter.get("last_verified")) + is_superseded = bool(re.search(r"superseded by", str(page.frontmatter.get("status", "")), re.IGNORECASE)) + expected = expected_confidence(current, last_verified, is_superseded) + if expected != current: + report.decayed.append((page.path, current, expected)) + if apply: + page.frontmatter["confidence"] = expected + write_page(page) + + print("[quick] archiving stale and superseded pages") + for page in iter_live_pages(): + conf = str(page.frontmatter.get("confidence", "medium")) + status_val = str(page.frontmatter.get("status", "")) + is_superseded = bool(re.search(r"superseded by", status_val, re.IGNORECASE)) + last_verified = parse_date(page.frontmatter.get("last_verified")) + if is_superseded: + reason = "Explicitly superseded" + if apply: + archive_page(page, reason) + report.archived.append((page.path, reason)) + continue + if conf == "stale": + days = (today() - last_verified).days if last_verified else -1 + reason = f"Confidence decayed to stale — no references in {days} days" + if apply: + archive_page(page, reason) + report.archived.append((page.path, reason)) + + print("[quick] checking index drift") + missing, stale_entries = find_index_drift() + report.index_drift_added = missing + report.index_drift_removed = stale_entries + if apply and (missing or stale_entries): + fix_index_drift(missing, stale_entries) + + print("[quick] checking for orphan pages") + orphans = find_orphan_pages() + for o in orphans: + if 
def _valid_xref_suggestions(
    found: list[tuple[Path, list[str]]],
) -> list[tuple[Path, list[str]]]:
    """Drop LLM-suggested links that are not strings or do not name an existing wiki file.

    The model is asked to pick from the index but nothing enforces it; writing
    an invented path into ``related`` would create a broken cross-reference.
    Pages left with no valid suggestion are omitted entirely.
    """
    valid: list[tuple[Path, list[str]]] = []
    for path, suggestions in found:
        kept: list[str] = []
        for s in suggestions:
            if isinstance(s, str) and s not in kept and (WIKI_DIR / s).is_file():
                kept.append(s)
        if kept:
            valid.append((path, kept))
    return valid


def run_full_hygiene(dry_run: bool = False, check_only: bool = False) -> HygieneReport:
    """Quick hygiene + LLM-powered checks.

    Runs run_quick_hygiene first, then: missing cross-reference suggestions
    (only for pages changed since the last full run), duplicate detection,
    contradiction detection (report-only) and the technology lifecycle scan.
    Changes are written only when neither ``dry_run`` nor ``check_only`` is set.

    Returns:
        The report started by the quick pass with the full-mode fields filled in.
    """
    print("[full] running quick hygiene first")
    report = run_quick_hygiene(dry_run=dry_run, check_only=check_only)

    apply = not (dry_run or check_only)

    # Only check pages that changed since last full run
    state = load_hygiene_state()
    all_pages = iter_live_pages()
    changed_pages = [p for p in all_pages if page_changed_since(state, p, "full")]
    print(f"[full] {len(changed_pages)}/{len(all_pages)} pages changed since last full run")

    print("[full] checking missing cross-references (haiku)")
    # Validate before reporting or applying so the report matches what is written.
    report.missing_xrefs = _valid_xref_suggestions(find_missing_cross_refs_llm(changed_pages))
    if apply:
        for path, suggestions in report.missing_xrefs:
            page = parse_page(path)
            if not page:
                continue
            existing = list(page.frontmatter.get("related") or [])
            for s in suggestions:
                if s not in existing:
                    existing.append(s)
            page.frontmatter["related"] = existing
            write_page(page)

    print("[full] checking for duplicate coverage (sonnet)")
    report.duplicates = find_duplicates_llm(all_pages)
    if apply:
        for weaker, stronger, reason in report.duplicates:
            wp = parse_page(weaker)
            if wp:
                archive_page(wp, f"Merged into {stronger.relative_to(WIKI_DIR)} — {reason}")

    print("[full] checking for contradictions (sonnet) — report-only")
    report.contradictions = find_contradictions_llm(all_pages)

    print("[full] checking technology lifecycle")
    report.tech_lifecycle = find_tech_lifecycle_issues()

    if apply:
        state["last_full_run"] = datetime.now(timezone.utc).isoformat()
        for page in iter_live_pages():
            mark_page_checked(state, page, "full")
        save_hygiene_state(state)

    return report
def write_needs_review_report(report: HygieneReport, mode: str) -> Path | None:
    """Write the needs-review report when the run left anything for a human.

    Sections are emitted for unfixed orphans, unfixed broken cross-references,
    empty stubs and state drift; in ``"full"`` mode also for contradictions and
    technology lifecycle flags. Returns the report path, or None (writing
    nothing) when every section is empty.
    """

    def shown(p: Path) -> str:
        return f"`{p.relative_to(WIKI_DIR)}`"

    body: list[str] = []

    def add_section(heading: str, rows: list[str], closing_blank: bool = True) -> None:
        if not rows:
            return
        body.extend([heading, "", *rows])
        if closing_blank:
            body.append("")

    add_section(
        "## Orphan pages (no inbound links)",
        [f"- {shown(p)}" for p in report.orphans_unfixed],
    )
    add_section(
        "## Broken cross-references (no fuzzy match)",
        [f"- {shown(t)} → missing link `{bad}`" for t, bad in report.xrefs_unfixed],
    )
    add_section(
        "## Empty stubs (body < 100 chars)",
        [f"- {shown(p)}" for p in report.empty_stubs],
    )
    add_section(
        "## State file drift",
        [f"- {msg}" for msg in report.state_drift],
    )

    if mode == "full":
        # Each contradiction is its own sub-section that already ends in a blank line.
        contradiction_rows: list[str] = []
        for a, b, desc in report.contradictions:
            contradiction_rows.extend([f"### {shown(a)} vs {shown(b)}", "", desc, ""])
        add_section(
            "## Contradictions (LLM-detected — human judgment required)",
            contradiction_rows,
            closing_blank=False,
        )
        add_section(
            "## Technology lifecycle flags",
            [f"- {shown(p)} — {note}" for p, note in report.tech_lifecycle],
        )

    if not body:
        return None

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    path = REPORTS_DIR / f"hygiene-{today().isoformat()}-needs-review.md"
    heading_count = len([ln for ln in body if ln.startswith(("## ", "### "))])
    header = [
        f"# Hygiene Report — Needs Review ({today().isoformat()})",
        "",
        f"Mode: {mode}",
        f"Items requiring attention: {heading_count}",
        "",
    ]
    path.write_text("\n".join(header + body) + "\n")
    return path
def main() -> int:
    """Command-line entry point for wiki hygiene.

    Exactly one mode may be selected (--quick, --full, --backfill, --scan-refs,
    --archive PATH, --restore PATH); with none given the quick loop runs.
    --dry-run and --check-only both suppress writes in the hygiene loops and in
    the --backfill / --scan-refs passes.

    Returns:
        Process exit code: 0 on success, 1 when the page given to --archive or
        --restore cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Wiki hygiene — quick and full modes")
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--quick", action="store_true", help="Run the quick hygiene loop (default)")
    mode.add_argument("--full", action="store_true", help="Run full hygiene (quick + LLM checks)")
    mode.add_argument("--backfill", action="store_true", help="Only run the last_verified backfill")
    mode.add_argument("--scan-refs", action="store_true", help="Only apply conversation refresh signals")
    mode.add_argument("--archive", metavar="PATH", help="Manually archive a live page")
    mode.add_argument("--restore", metavar="PATH", help="Manually restore an archived page")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change without writing")
    parser.add_argument("--check-only", action="store_true", help="Report issues without auto-fixing")
    parser.add_argument("--reason", default="Manual archive", help="Reason for --archive")
    args = parser.parse_args()

    # --check-only promises "no auto-fixing". The hygiene loops already treat it
    # like --dry-run; do the same for the stand-alone auto-fix passes so that
    # e.g. `--backfill --check-only` cannot modify pages.
    no_write = args.dry_run or args.check_only

    if args.backfill:
        changes = backfill_last_verified(dry_run=no_write)
        for p, src, d in changes:
            print(f" {p.relative_to(WIKI_DIR)} ← {src} ({d.isoformat()})")
        print(f"\n{len(changes)} page(s) backfilled")
        return 0

    if args.scan_refs:
        refs = scan_conversation_references()
        print(f"Found references to {len(refs)} wiki page(s)")
        changes = apply_refresh_signals(refs, dry_run=no_write)
        for p, old, new, d in changes:
            print(f" {p.relative_to(WIKI_DIR)} {old}→{new} ({d.isoformat()})")
        print(f"\n{len(changes)} page(s) refreshed")
        return 0

    if args.archive:
        # Relative paths are resolved against the wiki root.
        path = Path(args.archive)
        if not path.is_absolute():
            path = WIKI_DIR / path
        page = parse_page(path)
        if not page:
            print(f"Cannot parse page: {path}", file=sys.stderr)
            return 1
        archive_page(page, args.reason, dry_run=args.dry_run)
        return 0

    if args.restore:
        path = Path(args.restore)
        if not path.is_absolute():
            path = WIKI_DIR / path
        page = parse_page(path)
        if not page:
            print(f"Cannot parse page: {path}", file=sys.stderr)
            return 1
        restore_page(page, dry_run=args.dry_run)
        return 0

    # Default: quick or full hygiene loop
    mode_name = "full" if args.full else "quick"
    if args.full:
        report = run_full_hygiene(dry_run=args.dry_run, check_only=args.check_only)
    else:
        report = run_quick_hygiene(dry_run=args.dry_run, check_only=args.check_only)

    # NOTE(review): a --check-only run (without --dry-run) is still written to the
    # "-fixed" report file even though nothing was applied — confirm that is intended.
    fixed_path = write_fixed_report(report, mode_name, args.dry_run)
    review_path = write_needs_review_report(report, mode_name)

    print(f"\nFixed report: {fixed_path.relative_to(WIKI_DIR)}")
    if review_path:
        print(f"Needs-review report: {review_path.relative_to(WIKI_DIR)}")
    else:
        print("No items need human review.")
    return 0
# -----------------------------------------------------------------------------
# Argument parsing
# -----------------------------------------------------------------------------

# Every flag defaults to off; the values are the literal words true/false so
# they can be both string-compared and executed as commands later on.
FULL_MODE=false
HARVEST_ONLY=false
HYGIENE_ONLY=false
DRY_RUN=false
NO_COMPILE=false
NO_REINDEX=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    --full) FULL_MODE=true; shift ;;
    --harvest-only) HARVEST_ONLY=true; shift ;;
    --hygiene-only) HYGIENE_ONLY=true; shift ;;
    --dry-run) DRY_RUN=true; shift ;;
    --no-compile) NO_COMPILE=true; shift ;;
    --no-reindex) NO_REINDEX=true; shift ;;
    -h|--help)
      # Print the usage header (lines 3-20 of this file) without the leading
      # "# ". Use -E with "?" rather than the GNU-only BRE "\?" so the prefix
      # is also stripped by BSD/macOS sed.
      sed -n '3,20p' "$0" | sed -E 's/^# ?//'
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if [[ "${HARVEST_ONLY}" == "true" && "${HYGIENE_ONLY}" == "true" ]]; then
  echo "--harvest-only and --hygiene-only are mutually exclusive" >&2
  exit 1
fi
# -----------------------------------------------------------------------------
# Phase 1: Harvest
# -----------------------------------------------------------------------------

if [[ "${HYGIENE_ONLY}" != "true" ]]; then
  section "Phase 1: URL harvesting"
  harvest_args=()
  ${DRY_RUN} && harvest_args+=(--dry-run)
  ${NO_COMPILE} && harvest_args+=(--no-compile)

  # ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty. A plain
  # "${arr[@]}" aborts with "unbound variable" under `set -u` on bash < 4.4
  # (e.g. the bash 3.2 shipped with macOS) when no flags were added.
  if python3 "${SCRIPTS_DIR}/wiki-harvest.py" ${harvest_args[@]+"${harvest_args[@]}"}; then
    log "harvest completed"
  else
    # A failed harvest is logged but does not stop the remaining phases.
    log "[error] harvest failed (exit $?) — continuing to hygiene"
  fi
else
  section "Phase 1: URL harvesting (skipped)"
fi

# -----------------------------------------------------------------------------
# Phase 2: Hygiene
# -----------------------------------------------------------------------------

if [[ "${HARVEST_ONLY}" != "true" ]]; then
  section "Phase 2: Hygiene checks"
  hygiene_args=()
  if ${FULL_MODE}; then
    hygiene_args+=(--full)
  fi
  ${DRY_RUN} && hygiene_args+=(--dry-run)

  # Same empty-array-safe expansion as in phase 1.
  if python3 "${SCRIPTS_DIR}/wiki-hygiene.py" ${hygiene_args[@]+"${hygiene_args[@]}"}; then
    log "hygiene completed"
  else
    log "[error] hygiene failed (exit $?) — continuing to reindex"
  fi
else
  section "Phase 2: Hygiene checks (skipped)"
fi
+if [[ -d "${WIKI_DIR}/reports" ]]; then + latest_fixed="$(ls -t "${WIKI_DIR}"/reports/hygiene-*-fixed.md 2>/dev/null | head -n 1 || true)" + latest_review="$(ls -t "${WIKI_DIR}"/reports/hygiene-*-needs-review.md 2>/dev/null | head -n 1 || true)" + if [[ -n "${latest_fixed}" ]]; then + log "latest fixed report: $(basename "${latest_fixed}")" + fi + if [[ -n "${latest_review}" ]]; then + log "latest review report: $(basename "${latest_review}")" + fi +fi + +exit 0 diff --git a/scripts/wiki-staging.py b/scripts/wiki-staging.py new file mode 100755 index 0000000..46a0851 --- /dev/null +++ b/scripts/wiki-staging.py @@ -0,0 +1,639 @@ +#!/usr/bin/env python3 +"""Human-in-the-loop staging pipeline for wiki content. + +Pure file operations — no LLM calls. Moves pages between staging/ and the live +wiki, updates indexes, rewrites cross-references, and tracks rejections in +.harvest-state.json. + +Usage: + python3 scripts/wiki-staging.py --list # List pending items + python3 scripts/wiki-staging.py --list --json # JSON output + python3 scripts/wiki-staging.py --stats # Summary by type and age + python3 scripts/wiki-staging.py --promote PATH # Approve one page + python3 scripts/wiki-staging.py --reject PATH --reason "..." # Reject with reason + python3 scripts/wiki-staging.py --promote-all # Approve everything + python3 scripts/wiki-staging.py --review # Interactive approval loop + python3 scripts/wiki-staging.py --sync # Rebuild staging/index.md + +PATH may be relative to the wiki root (e.g. `staging/patterns/foo.md`) or absolute. 
def page_summary(page: WikiPage) -> dict[str, Any]:
    """Flatten a staged page into a plain dict for listings and promotion.

    Values come from the page's frontmatter with fallbacks: ``target_path`` and
    ``type`` are inferred from the page's location when absent, and
    ``staged_date`` / ``age_days`` are None when parse_date() yields nothing
    for the ``staged_date`` field.
    """
    meta = page.frontmatter
    wiki_rel = str(page.path.relative_to(WIKI_DIR))
    destination = meta.get("target_path") or _infer_target_path(page)

    staged_on = parse_date(meta.get("staged_date"))
    if staged_on:
        staged_label = str(staged_on)
        age_days = (today() - staged_on).days
    else:
        staged_label = None
        age_days = None

    return {
        "path": wiki_rel,
        "title": meta.get("title", page.path.stem),
        "type": meta.get("type", _infer_type(page)),
        "status": meta.get("status", "pending"),
        "origin": meta.get("origin", "automated"),
        "staged_by": meta.get("staged_by", "unknown"),
        "staged_date": staged_label,
        "age_days": age_days,
        "target_path": destination,
        "modifies": meta.get("modifies"),
        "compilation_notes": meta.get("compilation_notes", ""),
    }
page.path.relative_to(STAGING_DIR) + except ValueError: + return str(page.path.relative_to(WIKI_DIR)) + return str(rel) + + +def _infer_type(page: WikiPage) -> str: + """Infer type from the directory name when frontmatter doesn't specify it.""" + parts = page.path.relative_to(STAGING_DIR).parts + if len(parts) >= 2 and parts[0] in LIVE_CONTENT_DIRS: + return parts[0].rstrip("s") # 'patterns' → 'pattern' + return "unknown" + + +# --------------------------------------------------------------------------- +# Main index update +# --------------------------------------------------------------------------- + + +def _remove_from_main_index(rel_path: str) -> None: + if not INDEX_FILE.exists(): + return + text = INDEX_FILE.read_text() + lines = text.splitlines(keepends=True) + pattern = re.compile(rf"^- \[.+\]\({re.escape(rel_path)}\) ") + new_lines = [line for line in lines if not pattern.match(line)] + if len(new_lines) != len(lines): + INDEX_FILE.write_text("".join(new_lines)) + + +def _add_to_main_index(rel_path: str, title: str, summary: str = "") -> None: + """Append a new entry under the appropriate section. 
Best-effort — operator may re-order later.""" + if not INDEX_FILE.exists(): + return + text = INDEX_FILE.read_text() + # Avoid duplicates + if f"]({rel_path})" in text: + return + entry = f"- [{title}]({rel_path})" + if summary: + entry += f" — {summary}" + entry += "\n" + # Insert at the end of the first matching section + ptype = rel_path.split("/")[0] + section_headers = { + "patterns": "## Patterns", + "decisions": "## Decisions", + "concepts": "## Concepts", + "environments": "## Environments", + } + header = section_headers.get(ptype) + if header and header in text: + # Find the header and append before the next ## header or EOF + idx = text.find(header) + next_header = text.find("\n## ", idx + len(header)) + if next_header == -1: + next_header = len(text) + # Find the last non-empty line in the section + section = text[idx:next_header] + last_nl = section.rfind("\n", 0, len(section) - 1) + 1 + INDEX_FILE.write_text(text[: idx + last_nl] + entry + text[idx + last_nl :]) + else: + INDEX_FILE.write_text(text.rstrip() + "\n" + entry) + + +# --------------------------------------------------------------------------- +# Staging index update +# --------------------------------------------------------------------------- + + +def regenerate_staging_index() -> None: + STAGING_DIR.mkdir(parents=True, exist_ok=True) + pending = list_pending() + + lines = [ + "# Staging — Pending Wiki Content", + "", + "Content awaiting human review. 
def _rewrite_cross_references(old_path: str, new_path: str) -> int:
    """Rewrite links and ``related:`` entries that point at ``old_path``.

    Scans the main index plus every ``*.md`` file directly inside each content
    directory of the live wiki, staging, and archive trees. Three reference
    shapes are rewritten: ``](old_path)``, ``](../old_path)``, and a
    frontmatter-style list line consisting solely of ``- old_path``.

    Args:
        old_path: Wiki-relative path being replaced (matched literally).
        new_path: Wiki-relative path to substitute (inserted literally).

    Returns:
        The number of files whose content changed.
    """
    targets: list[Path] = [INDEX_FILE]
    for root in (WIKI_DIR, STAGING_DIR, ARCHIVE_DIR):
        for sub in LIVE_CONTENT_DIRS:
            folder = root / sub
            if folder.is_dir():
                targets.extend(folder.glob("*.md"))

    old_esc = re.escape(old_path)
    link_re = re.compile(rf"\]\((\.\./)?{old_esc}\)")
    related_re = re.compile(rf"^(\s*-\s*){old_esc}$", re.MULTILINE)

    # Replacement callables insert new_path verbatim; a replacement *string*
    # would have its backslashes and group references interpreted by re.sub.
    def _swap_link(m: re.Match[str]) -> str:
        return f"]({m.group(1) or ''}{new_path})"

    def _swap_related(m: re.Match[str]) -> str:
        return f"{m.group(1)}{new_path}"

    count = 0
    for target in targets:
        if not target.exists():
            continue
        try:
            text = target.read_text()
        except (OSError, UnicodeDecodeError):
            # Unreadable or undecodable file: skip it rather than abort the pass.
            continue
        new_text = related_re.sub(_swap_related, link_re.sub(_swap_link, text))
        if new_text != text:
            target.write_text(new_text)
            count += 1
    return count
def _record_rejection_in_harvest_state(page: WikiPage, reason: str) -> None:
    """Remember the source URL of a rejected staged page in the harvest state.

    The URL is taken from the page's ``harvest_source`` frontmatter field. If
    that is absent, the ``sources`` list is searched for a ``raw/harvested/``
    file that exists and parses, and that file's ``source_url`` is used. When
    a URL is found it is stored under ``rejected_urls`` (with the reason and
    today's date) and dropped from ``harvested_urls``.

    Silently does nothing when the state file is missing, unreadable, or not
    valid JSON, or when no source URL can be determined.
    """
    if not HARVEST_STATE_FILE.exists():
        return

    url = page.frontmatter.get("harvest_source")
    if not url:
        listed = page.frontmatter.get("sources") or []
        candidates = listed if isinstance(listed, list) else []
        for entry in candidates:
            entry_str = str(entry)
            # Only raw/harvested/... files carry a source_url to look up.
            if "raw/harvested/" not in entry_str:
                continue
            raw_file = WIKI_DIR / entry_str
            if not raw_file.exists():
                continue
            parsed = parse_page(raw_file)
            if not parsed:
                continue
            # First parseable raw file wins, even if it has no source_url.
            url = parsed.frontmatter.get("source_url")
            break

    if not url:
        return

    try:
        with open(HARVEST_STATE_FILE) as fh:
            state = json.load(fh)
    except (OSError, json.JSONDecodeError):
        return

    rejected = state.setdefault("rejected_urls", {})
    rejected[url] = {
        "reason": reason,
        "rejected_date": today().isoformat(),
    }
    # A rejected URL must no longer count as harvested.
    state.get("harvested_urls", {}).pop(url, None)

    with open(HARVEST_STATE_FILE, "w") as fh:
        json.dump(state, fh, indent=2, sort_keys=True)
or just "patterns/foo.md" (assumes staging) + if not raw_path.startswith("staging/") and raw_path.split("/", 1)[0] in LIVE_CONTENT_DIRS: + path = STAGING_DIR / raw_path + else: + path = WIKI_DIR / raw_path + if not path.exists(): + print(f" [error] not found: {path}", file=sys.stderr) + return None + return parse_page(path) + + +# --------------------------------------------------------------------------- +# Commands +# --------------------------------------------------------------------------- + + +def cmd_list(as_json: bool = False) -> int: + pending = list_pending() + if as_json: + data = [page_summary(p) for p in pending] + print(json.dumps(data, indent=2)) + return 0 + + if not pending: + print("No pending items in staging.") + return 0 + + print(f"{len(pending)} pending item(s):\n") + for p in pending: + s = page_summary(p) + age = f"{s['age_days']}d" if s["age_days"] is not None else "—" + marker = " (update)" if s["modifies"] else "" + print(f" {s['path']}{marker}") + print(f" title: {s['title']}") + print(f" type: {s['type']}") + print(f" source: {s['staged_by']}") + print(f" staged: {s['staged_date']} ({age} old)") + print(f" target: {s['target_path']}") + if s["modifies"]: + print(f" modifies: {s['modifies']}") + if s["compilation_notes"]: + notes = s["compilation_notes"][:100] + print(f" notes: {notes}") + print() + return 0 + + +def cmd_stats() -> int: + pending = list_pending() + total = len(pending) + if total == 0: + print("No pending items in staging.") + return 0 + + by_type: dict[str, int] = {} + by_source: dict[str, int] = {} + ages: list[int] = [] + updates = 0 + + for p in pending: + s = page_summary(p) + by_type[s["type"]] = by_type.get(s["type"], 0) + 1 + by_source[s["staged_by"]] = by_source.get(s["staged_by"], 0) + 1 + if s["age_days"] is not None: + ages.append(s["age_days"]) + if s["modifies"]: + updates += 1 + + print(f"Total pending: {total}") + print(f"Updates (modifies existing): {updates}") + print(f"New pages: {total - updates}") + 
def cmd_promote_all(dry_run: bool) -> int:
    """Promote every pending staged page in listing order.

    Args:
        dry_run: Passed through to ``promote`` for each page.

    Returns:
        Always 0, including when there is nothing to promote.
    """
    queue = list_pending()
    if queue:
        print(f"Promoting {len(queue)} page(s)...")
        for staged in queue:
            promote(staged, dry_run=dry_run)
    else:
        print("No pending items.")
    return 0
def cmd_sync() -> int:
    """Reconcile the staging index after manual moves or deletions.

    Regenerates the staging index, then lists any live page whose frontmatter
    still says ``status: pending`` — the sign of a file moved out of staging by
    hand without going through the promotion flow. Finishes by printing how
    many items remain pending.

    Returns:
        Always 0.
    """
    print("Regenerating staging index...")
    regenerate_staging_index()

    # Live pages that kept their staging status were promoted manually.
    still_pending = [
        live.path
        for live in iter_live_pages()
        if str(live.frontmatter.get("status", "")) == "pending"
    ]
    if still_pending:
        print("\n[warn] live pages still marked status: pending — fix manually:")
        for leaked_path in still_pending:
            print(f" {leaked_path.relative_to(WIKI_DIR)}")

    remaining = len(list_pending())
    print(f"\n{remaining} pending item(s) in staging.")
    return 0
# Take the single-instance lock or exit if another sync is still running.
#
# The lock file is created with noclobber, so creation is atomic: the redirect
# fails if the file already exists. An existing lock whose recorded pid is no
# longer alive is treated as stale, removed, and re-acquired. The EXIT trap
# that removes the lock is installed only once this process owns it, so a run
# that loses the race never deletes the winner's lock.
#
# NOTE: two processes that both find the same stale lock can still race on the
# remove-and-retry step; that window is far narrower than check-then-write.
acquire_lock() {
  if ! ( set -o noclobber; echo $$ > "${LOCK_FILE}" ) 2>/dev/null; then
    local pid
    pid=$(cat "${LOCK_FILE}" 2>/dev/null || echo "")
    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
      die "Another sync is running (pid ${pid})"
    fi
    # Stale lock: its owner is gone (or the file is empty/unreadable).
    rm -f "${LOCK_FILE}"
    if ! ( set -o noclobber; echo $$ > "${LOCK_FILE}" ) 2>/dev/null; then
      die "Could not acquire lock ${LOCK_FILE}"
    fi
  fi
  trap 'rm -f "${LOCK_FILE}"' EXIT
}
# Auto-resolve the conflicts left by a failed `git pull --rebase`.
#
#   *.md   union merge: conflict marker lines are deleted, both sides kept.
#   other  the local version wins.
#
# During a rebase git's side names are swapped relative to a merge: "ours" is
# the upstream commit being rebased onto and "theirs" is the local commit
# being replayed. Keeping the local version therefore needs --theirs.
resolve_conflicts() {
  cd "${WIKI_DIR}"

  local conflicted
  conflicted=$(git diff --name-only --diff-filter=U 2>/dev/null || echo "")

  if [[ -z "${conflicted}" ]]; then
    return 0
  fi

  while IFS= read -r file; do
    if [[ "${file}" == *.md ]]; then
      # For markdown: accept both sides (union merge)
      if [[ -f "${file}" ]]; then
        # Only a line made up solely of seven '=' is a separator; a longer
        # run of '=' (e.g. a setext heading underline) is page content.
        sed -i.bak \
          -e '/^<<<<<<< /d' \
          -e '/^=======[[:space:]]*$/d' \
          -e '/^>>>>>>> /d' \
          "${file}"
        rm -f "${file}.bak"
        git add "${file}"
        log "Auto-resolved conflict in ${file} (kept both sides)"
      fi
    else
      # For non-markdown: local version wins (--theirs while rebasing)
      git checkout --theirs "${file}" 2>/dev/null
      git add "${file}"
      log "Auto-resolved conflict in ${file} (kept local)"
    fi
  done <<< "${conflicted}"

  # Continue the rebase. GIT_EDITOR=true stops git from opening an editor for
  # the commit message, which would hang or fail under cron.
  GIT_EDITOR=true git rebase --continue 2>/dev/null || git commit --no-edit 2>/dev/null || true
}
# Entry point: run the operation selected by the first argument.
#
# With no argument (or the literal word "full") the whole pipeline runs:
# commit, pull, push, reindex. Unrecognised arguments are rejected before any
# directory, lock, or git state is touched, so a mistyped flag cannot trigger
# a commit and push.
main() {
  local mode="${1:-full}"

  case "${mode}" in
    full|--commit|--pull|--push|--reindex|--status)
      ;;
    -h|--help)
      echo "Usage: wiki-sync.sh [--commit|--pull|--push|--reindex|--status]"
      echo "With no argument: commit, pull, push, then reindex."
      return 0
      ;;
    *)
      echo "wiki-sync.sh: unknown option '${mode}' (try --help)" >&2
      return 2
      ;;
  esac

  mkdir -p "${WIKI_DIR}/scripts"

  # Status doesn't need a lock
  if [[ "${mode}" == "--status" ]]; then
    do_status
    return 0
  fi

  acquire_lock

  case "${mode}" in
    --commit)  do_commit ;;
    --pull)    do_pull ;;
    --push)    do_push ;;
    --reindex) do_reindex ;;
    full)
      do_commit
      do_pull
      do_push
      do_reindex
      ;;
  esac
}
def parse_yaml_lite(text: str) -> dict[str, Any]:
    """Parse the small YAML subset used in wiki frontmatter.

    Supported forms::

        key: value
        key: [a, b, c]
        key:
          - a
          - b

    Keys must start at column 0. Blank lines, ``#`` comment lines, and lines
    that are not ``key:`` mappings are ignored. Surrounding single or double
    quotes are removed from scalars, inline-list items, and block-list items
    alike. Block-list items may be indented or sit at the key's own column.
    A key with no value and no list items maps to ``""``.

    Args:
        text: Raw frontmatter text, without the ``---`` fences.

    Returns:
        A dict mapping each key to a ``str`` or a ``list[str]``.
    """

    def _unquote(raw: str) -> str:
        return raw.strip().strip('"').strip("'")

    key_re = re.compile(r"^([\w_-]+):\s*(.*)$")
    item_re = re.compile(r"^\s*-\s+(.*)$")

    result: dict[str, Any] = {}
    lines = text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1
        if not line.strip() or line.lstrip().startswith("#"):
            continue
        m = key_re.match(line)
        if m is None:
            continue
        key, rest = m.group(1), m.group(2).strip()

        if rest == "":
            # Bare "key:" — consume the run of "- item" lines that follows.
            items: list[str] = []
            while i < len(lines):
                item = item_re.match(lines[i])
                if item is None:
                    break
                items.append(_unquote(item.group(1)))
                i += 1
            result[key] = items if items else ""
        elif rest.startswith("[") and rest.endswith("]"):
            inner = rest[1:-1].strip()
            result[key] = [_unquote(x) for x in inner.split(",")] if inner else []
        else:
            result[key] = _unquote(rest)
    return result
def page_content_hash(page: WikiPage) -> str:
    """Return a ``sha256:``-prefixed hex digest of the page body.

    Only the body is hashed, after stripping leading and trailing whitespace,
    so mechanical frontmatter fixes don't churn the hash.
    """
    normalized_body = page.body.strip()
    digest = hashlib.sha256(normalized_body.encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"
+ +## Running + +```bash +# Full suite (from wiki root) +bash tests/run.sh + +# Single test file +bash tests/run.sh test_wiki_lib.py + +# Single test class or function +bash tests/run.sh test_wiki_hygiene.py::TestArchiveRestore +bash tests/run.sh test_wiki_hygiene.py::TestArchiveRestore::test_restore_reverses_archive + +# Pattern matching +bash tests/run.sh -k "archive" + +# Verbose +bash tests/run.sh -v + +# Stop on first failure +bash tests/run.sh -x + +# Or invoke pytest directly from the tests dir +cd tests && python3 -m pytest -v +``` + +## What's tested + +| File | Coverage | +|------|----------| +| `test_wiki_lib.py` | YAML parser, frontmatter round-trip, page iterators, date parsing, content hashing, WIKI_DIR env override | +| `test_wiki_hygiene.py` | Backfill, confidence decay math, frontmatter repair, archive/restore round-trip, orphan detection, broken-xref fuzzy matching, index drift, empty stubs, conversation refresh signals, auto-restore, staging/archive sync, state drift, hygiene state file, full quick-run idempotency | +| `test_wiki_staging.py` | List, promote, reject, promote-with-modifies, dry-run, staging index regeneration, path resolution | +| `test_wiki_harvest.py` | URL classification (harvest/check/skip), private IP detection, URL extraction + filtering, filename derivation, content validation, state management, raw file writing, dry-run CLI smoke test | +| `test_conversation_pipeline.py` | CLI smoke tests for extract-sessions, summarize-conversations, update-conversation-index; dry-run behavior; help flags; integration test with fake conversation files | +| `test_shell_scripts.py` | wiki-maintain.sh / mine-conversations.sh / wiki-sync.sh: help, dry-run, mutex flags, bash syntax check, strict-mode check, shebang check, py_compile for all .py scripts | + +## How it works + +**Isolation**: Every test runs against a disposable `tmp_wiki` fixture +(pytest `tmp_path`). 
The fixture sets the `WIKI_DIR` environment variable +so all scripts resolve paths against the tmp directory instead of the real +wiki. No test ever touches `~/projects/wiki`. + +**Hyphenated filenames**: Scripts like `wiki-harvest.py` use hyphens, which +Python's `import` can't handle directly. `conftest.py` has a +`_load_script_module` helper that loads a script file by path and exposes +it as a module object. + +**Clean module state**: Each test that loads a module clears any cached +import first, so `WIKI_DIR` env overrides take effect correctly between +tests. + +**Subprocess tests** (for CLI smoke tests): `conftest.py` provides a +`run_script` fixture that invokes a script via `python3` or `bash` with +`WIKI_DIR` set to the tmp wiki. Uses `subprocess.run` with `capture_output` +and a timeout. + +## Cross-platform + +- `#!/usr/bin/env bash` shebangs (tested explicitly) +- `set -euo pipefail` in all shell scripts (tested explicitly) +- `bash -n` syntax check on all shell scripts +- `py_compile` on all Python scripts +- Uses `pathlib` everywhere — no hardcoded path separators +- Uses the Python stdlib only (except pytest itself) + +## Requirements + +- Python 3.11+ +- `pytest` — install with `pip install --user pytest` or your distro's package manager +- `bash` (any version — scripts use only portable features) + +The tests do NOT require: +- `claude` CLI (mocked / skipped) +- `trafilatura` or `crawl4ai` (only dry-run / classification paths tested) +- `qmd` (reindex phase is skipped in tests) +- Network access +- The real `~/projects/wiki` or `~/.claude/projects` directories + +## Speed + +Full suite runs in **~1 second** on a modern laptop. All tests are isolated +and independent so they can run in any order and in parallel. + +## What's NOT tested + +- **Real LLM calls** (`claude -p`): too expensive, non-deterministic. + Tested: CLI parsing, dry-run paths, mocked error handling. +- **Real web fetches** (trafilatura/crawl4ai): too slow, non-deterministic. 
def _load_script_module(name: str, path: Path) -> Any:
    """Load a Python script file as a freshly executed module object.

    Any cached copy of ``name`` and of ``wiki_lib`` is dropped first so that
    module-level constants derived from ``WIKI_DIR`` are recomputed for the
    current test. ``scripts/`` is put on ``sys.path`` so the script's own
    ``import wiki_lib`` resolves.

    Args:
        name: Module name to register in ``sys.modules``.
        path: Path to the script file (hyphenated filenames are fine).

    Returns:
        The executed module.

    Raises:
        Whatever the script raises at import time. In that case the
        half-initialised module is removed from ``sys.modules`` again.
    """
    # Clear cached imports so WIKI_DIR env changes take effect between tests
    for cached in (name, "wiki_lib"):
        sys.modules.pop(cached, None)

    # Make sure scripts/ is on sys.path so intra-script imports (wiki_lib) work
    scripts_str = str(SCRIPTS_DIR)
    if scripts_str not in sys.path:
        sys.path.insert(0, scripts_str)

    spec = importlib.util.spec_from_file_location(name, path)
    assert spec is not None and spec.loader is not None
    mod = importlib.util.module_from_spec(spec)
    # Register before executing so the module is visible while it runs.
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Don't leave a partially executed module behind for later lookups.
        sys.modules.pop(name, None)
        raise
    return mod
+ """ + wiki = tmp_path / "wiki" + wiki.mkdir() + + # Create the directory tree + for sub in ["patterns", "decisions", "concepts", "environments"]: + (wiki / sub).mkdir() + (wiki / "staging" / sub).mkdir(parents=True) + (wiki / "archive" / sub).mkdir(parents=True) + (wiki / "raw" / "harvested").mkdir(parents=True) + (wiki / "conversations").mkdir() + (wiki / "reports").mkdir() + + # Create minimal index.md + (wiki / "index.md").write_text( + "# Wiki Index\n\n" + "## Patterns\n\n" + "## Decisions\n\n" + "## Concepts\n\n" + "## Environments\n\n" + ) + + # Empty state files + (wiki / ".harvest-state.json").write_text(json.dumps({ + "harvested_urls": {}, + "skipped_urls": {}, + "failed_urls": {}, + "rejected_urls": {}, + "last_run": None, + })) + + # Point all scripts at this tmp wiki + monkeypatch.setenv("WIKI_DIR", str(wiki)) + + return wiki + + +# --------------------------------------------------------------------------- +# Sample page factories +# --------------------------------------------------------------------------- + + +def make_page( + wiki: Path, + rel_path: str, + *, + title: str | None = None, + ptype: str | None = None, + confidence: str = "high", + last_compiled: str = "2026-04-01", + last_verified: str = "2026-04-01", + origin: str = "manual", + sources: list[str] | None = None, + related: list[str] | None = None, + body: str = "# Content\n\nA substantive page with real content so it is not a stub.\n", + extra_fm: dict[str, Any] | None = None, +) -> Path: + """Write a well-formed wiki page with all required frontmatter fields.""" + if sources is None: + sources = [] + if related is None: + related = [] + """Write a page to the tmp wiki and return its path.""" + path = wiki / rel_path + path.parent.mkdir(parents=True, exist_ok=True) + + if title is None: + title = path.stem.replace("-", " ").title() + if ptype is None: + ptype = path.parent.name.rstrip("s") + + fm_lines = [ + "---", + f"title: {title}", + f"type: {ptype}", + f"confidence: 
{confidence}", + f"origin: {origin}", + f"last_compiled: {last_compiled}", + f"last_verified: {last_verified}", + ] + if sources is not None: + if sources: + fm_lines.append("sources:") + fm_lines.extend(f" - {s}" for s in sources) + else: + fm_lines.append("sources: []") + if related is not None: + if related: + fm_lines.append("related:") + fm_lines.extend(f" - {r}" for r in related) + else: + fm_lines.append("related: []") + if extra_fm: + for k, v in extra_fm.items(): + if isinstance(v, list): + if v: + fm_lines.append(f"{k}:") + fm_lines.extend(f" - {item}" for item in v) + else: + fm_lines.append(f"{k}: []") + else: + fm_lines.append(f"{k}: {v}") + fm_lines.append("---") + + path.write_text("\n".join(fm_lines) + "\n" + body) + return path + + +def make_conversation( + wiki: Path, + project: str, + filename: str, + *, + date: str = "2026-04-10", + status: str = "summarized", + messages: int = 100, + related: list[str] | None = None, + body: str = "## Summary\n\nTest conversation summary.\n", +) -> Path: + """Write a conversation file to the tmp wiki.""" + proj_dir = wiki / "conversations" / project + proj_dir.mkdir(parents=True, exist_ok=True) + path = proj_dir / filename + + fm_lines = [ + "---", + f"title: Test Conversation {filename}", + "type: conversation", + f"project: {project}", + f"date: {date}", + f"status: {status}", + f"messages: {messages}", + ] + if related: + fm_lines.append("related:") + fm_lines.extend(f" - {r}" for r in related) + fm_lines.append("---") + + path.write_text("\n".join(fm_lines) + "\n" + body) + return path + + +def make_staging_page( + wiki: Path, + rel_under_staging: str, + *, + title: str = "Pending Page", + ptype: str = "pattern", + staged_by: str = "wiki-harvest", + staged_date: str = "2026-04-10", + modifies: str | None = None, + target_path: str | None = None, + body: str = "# Pending\n\nStaged content body.\n", +) -> Path: + path = wiki / "staging" / rel_under_staging + path.parent.mkdir(parents=True, exist_ok=True) + + 
@pytest.fixture
def wiki_lib(tmp_wiki: Path) -> Any:
    """Provide a freshly imported ``wiki_lib`` module.

    Depends on ``tmp_wiki`` so the temporary wiki is set up before the
    module is loaded.
    """
    lib_path = SCRIPTS_DIR / "wiki_lib.py"
    return _load_script_module("wiki_lib", lib_path)
@pytest.fixture
def run_script(tmp_wiki: Path):
    """Return a function that runs a script via subprocess with WIKI_DIR set.

    The returned callable takes a script path relative to ``SCRIPTS_DIR``
    plus CLI arguments and returns the ``subprocess.CompletedProcess`` with
    captured text ``stdout``/``stderr``. ``.py`` scripts are run with the
    interpreter that is running pytest; everything else is run with ``bash``.
    """
    import subprocess

    def _run(script_rel: str, *args: str, timeout: int = 60) -> subprocess.CompletedProcess:
        script = SCRIPTS_DIR / script_rel
        if script.suffix == ".py":
            # Use the current interpreter rather than whatever "python3" is
            # first on PATH, so scripts see the same environment as the test
            # run (virtualenv, installed packages). Fall back to "python3"
            # if the interpreter path is unavailable.
            cmd = [sys.executable or "python3", str(script), *args]
        else:
            cmd = ["bash", str(script), *args]
        # Copy the environment at call time so per-test env changes are
        # inherited, then point the script at the disposable wiki.
        env = os.environ.copy()
        env["WIKI_DIR"] = str(tmp_wiki)
        return subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )

    return _run
#!/usr/bin/env bash
set -euo pipefail

# run.sh — Convenience wrapper for running the wiki pipeline test suite.
#
# Usage:
#   bash tests/run.sh                    # Run the full suite
#   bash tests/run.sh -v                 # Verbose output
#   bash tests/run.sh test_wiki_lib.py   # Run one file
#   bash tests/run.sh -k "parse"         # Run tests matching a pattern
#
# All arguments are passed through to pytest unchanged.
#
# Exit status: 2 if pytest is not importable; otherwise pytest's own status.

# Run from the directory containing this script so relative test paths work
# no matter where the wrapper is invoked from.
TESTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "${TESTS_DIR}"

# Verify pytest is available
if ! python3 -c "import pytest" 2>/dev/null; then
    # Diagnostics go to stderr so stdout stays clean for callers.
    echo "pytest not installed. Install with: pip install --user pytest" >&2
    exit 2
fi

# Clear any previous test artifacts
rm -rf .pytest_cache 2>/dev/null || true

# With no arguments, run the whole suite with short tracebacks;
# otherwise hand the arguments to pytest as given.
if [[ $# -eq 0 ]]; then
    exec python3 -m pytest --tb=short
else
    exec python3 -m pytest "$@"
fi
class TestExtractSessions:
    """CLI smoke tests for ``scripts/extract-sessions.py``."""

    def test_help_exits_clean(self, run_script) -> None:
        """``--help`` exits 0 and documents the main flags."""
        result = run_script("extract-sessions.py", "--help")
        assert result.returncode == 0, result.stderr
        assert "--project" in result.stdout
        assert "--dry-run" in result.stdout

    def test_dry_run_with_empty_sessions_dir(
        self, run_script, tmp_wiki: Path, tmp_path: Path, monkeypatch
    ) -> None:
        """A dry run with nothing to extract exits cleanly."""
        # Point CLAUDE_PROJECTS_DIR at an empty tmp dir so the run does not
        # depend on the developer's real sessions. run_script copies
        # os.environ at call time, so the subprocess inherits this value.
        # NOTE(review): tests/README.md lists the CLAUDE_PROJECTS_DIR
        # override as tested, so the script presumably honours it — confirm.
        empty_projects = tmp_path / "claude-projects"
        empty_projects.mkdir()
        monkeypatch.setenv("CLAUDE_PROJECTS_DIR", str(empty_projects))

        # --project with a code that has no sessions keeps the run a no-op
        # even if the env override is ignored.
        result = run_script("extract-sessions.py", "--dry-run", "--project", "nonexistent")
        assert result.returncode == 0, result.stderr

    def test_rejects_unknown_flag(self, run_script) -> None:
        """An unrecognised flag yields a non-zero exit and an error message."""
        result = run_script("extract-sessions.py", "--bogus-flag")
        assert result.returncode != 0
        stderr = result.stderr.lower()
        assert "error" in stderr or "unrecognized" in stderr
class TestUpdateConversationIndex:
    """Smoke tests for ``scripts/update-conversation-index.py``."""

    def test_help_exits_clean(self, run_script) -> None:
        """``--help`` exits 0."""
        exit_code = run_script("update-conversation-index.py", "--help").returncode
        assert exit_code == 0

    def test_runs_on_empty_conversations_dir(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """The script must not crash when there are no conversations."""
        outcome = run_script("update-conversation-index.py")
        assert outcome.returncode == 0

    def test_builds_index_from_conversations(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """Two summarized conversations both show up in the generated index."""
        from conftest import make_conversation

        filenames = ("2026-04-10-one.md", "2026-04-11-two.md")
        for filename in filenames:
            make_conversation(tmp_wiki, "general", filename, status="summarized")

        outcome = run_script("update-conversation-index.py")
        assert outcome.returncode == 0

        index_path = tmp_wiki / "conversations" / "index.md"
        assert index_path.exists()
        index_text = index_path.read_text()
        # Accept either the full filename or its short suffix.
        assert any(name in index_text for name in ("2026-04-10-one.md", "one.md"))
        assert any(name in index_text for name in ("2026-04-11-two.md", "two.md"))
class TestWikiMaintainSh:
    """Smoke tests for flag handling and dry-run phases of wiki-maintain.sh."""

    def test_help_flag(self, run_script) -> None:
        """``--help`` prints usage text that names the mode flags."""
        proc = run_script("wiki-maintain.sh", "--help")
        assert proc.returncode == 0
        out = proc.stdout
        assert "Usage:" in out or "usage:" in out.lower()
        for flag in ("--full", "--harvest-only", "--hygiene-only"):
            assert flag in out

    def test_rejects_unknown_flag(self, run_script) -> None:
        """An unknown option fails with a message on stderr."""
        proc = run_script("wiki-maintain.sh", "--bogus")
        assert proc.returncode != 0
        assert "Unknown option" in proc.stderr

    def test_harvest_only_and_hygiene_only_conflict(self, run_script) -> None:
        """The two ``--*-only`` modes cannot be combined."""
        flags = ("--harvest-only", "--hygiene-only")
        proc = run_script("wiki-maintain.sh", *flags)
        assert proc.returncode != 0
        assert "mutually exclusive" in proc.stderr

    def test_hygiene_only_dry_run_completes(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """A hygiene-only dry run reaches phase 2 and reports completion."""
        make_page(tmp_wiki, "patterns/one.md")
        proc = run_script(
            "wiki-maintain.sh", "--hygiene-only", "--dry-run", "--no-reindex"
        )
        assert proc.returncode == 0
        for marker in ("Phase 2: Hygiene checks", "finished"):
            assert marker in proc.stdout

    def test_phase_1_skipped_in_hygiene_only(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """Hygiene-only mode announces that harvesting was skipped."""
        proc = run_script(
            "wiki-maintain.sh", "--hygiene-only", "--dry-run", "--no-reindex"
        )
        assert proc.returncode == 0
        assert "Phase 1: URL harvesting (skipped)" in proc.stdout

    def test_phase_3_skipped_in_dry_run(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """A dry run announces that the qmd reindex was skipped."""
        make_page(tmp_wiki, "patterns/one.md")
        output = run_script(
            "wiki-maintain.sh", "--hygiene-only", "--dry-run"
        ).stdout
        assert "Phase 3: qmd reindex (skipped)" in output

    def test_harvest_only_dry_run_completes(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """A harvest-only dry run exits 0 and reports hygiene as skipped."""
        # Give the harvester one summarized conversation to scan.
        make_conversation(
            tmp_wiki,
            "test",
            "2026-04-10-test.md",
            status="summarized",
            body="See https://docs.python.org/3/library/os.html for details.\n",
        )
        cli_flags = ("--harvest-only", "--dry-run", "--no-compile", "--no-reindex")
        proc = run_script("wiki-maintain.sh", *cli_flags)
        assert proc.returncode == 0
        assert "Phase 2: Hygiene checks (skipped)" in proc.stdout
class TestMineConversationsSh:
    """Smoke tests for mine-conversations.sh argument handling."""

    def test_extract_only_dry_run(self, run_script, tmp_wiki: Path) -> None:
        """mine-conversations.sh --extract-only --dry-run should complete without LLM."""
        cli_flags = ("--extract-only", "--dry-run")
        exit_code = run_script(
            "mine-conversations.sh", *cli_flags, timeout=30
        ).returncode
        assert exit_code == 0

    def test_rejects_unknown_flag(self, run_script) -> None:
        """An unrecognised flag produces a non-zero exit status."""
        exit_code = run_script("mine-conversations.sh", "--bogus-flag").returncode
        assert exit_code != 0
class TestPythonSyntax:
    """Byte-compile every pipeline script to catch syntax errors early."""

    @pytest.mark.parametrize(
        "script",
        [
            "wiki_lib.py",
            "wiki-harvest.py",
            "wiki-staging.py",
            "wiki-hygiene.py",
            "extract-sessions.py",
            "summarize-conversations.py",
            "update-conversation-index.py",
        ],
    )
    def test_py_compile(self, script: str) -> None:
        """py_compile catches syntax errors without executing the module."""
        import py_compile
        import tempfile

        path = Path(__file__).parent.parent / "scripts" / script
        # Send the bytecode to a throwaway directory so the check never
        # writes into scripts/__pycache__ (and works on a read-only checkout).
        with tempfile.TemporaryDirectory() as scratch:
            cfile = Path(scratch) / (path.stem + ".pyc")
            # doraise=True turns a compile failure into PyCompileError.
            py_compile.compile(str(path), cfile=str(cfile), doraise=True)
class TestClassifyUrl:
    """``classify_url`` sorts URLs into "harvest", "check" or "skip"."""

    def test_regular_docs_site_harvest(self, wiki_harvest: Any) -> None:
        for url in (
            "https://docs.python.org/3/library/os.html",
            "https://blog.example.com/post",
        ):
            assert wiki_harvest.classify_url(url) == "harvest"

    def test_github_issue_is_check(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url("https://github.com/foo/bar/issues/42")
        assert verdict == "check"

    def test_github_pr_is_check(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url("https://github.com/foo/bar/pull/99")
        assert verdict == "check"

    def test_stackoverflow_is_check(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url(
            "https://stackoverflow.com/questions/12345/title"
        )
        assert verdict == "check"

    def test_localhost_skip(self, wiki_harvest: Any) -> None:
        for url in ("http://localhost:3000/path", "http://localhost/foo"):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_private_ip_skip(self, wiki_harvest: Any) -> None:
        for url in (
            "http://10.0.0.1/api",
            "http://172.30.224.1:8080/v1",
            "http://192.168.1.1/test",
            "http://127.0.0.1:8080/foo",
        ):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_local_and_internal_tld_skip(self, wiki_harvest: Any) -> None:
        # `.local` and `.internal` are baked into SKIP_DOMAIN_PATTERNS
        for url in ("https://router.local/admin", "https://service.internal/api"):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_custom_skip_pattern_runtime(self, wiki_harvest: Any) -> None:
        # Users can append their own patterns at runtime — verify the hook works
        patterns = wiki_harvest.SKIP_DOMAIN_PATTERNS
        patterns.append(r"\.mycompany\.com$")
        try:
            for url in (
                "https://git.mycompany.com/foo",
                "https://docs.mycompany.com/api",
            ):
                assert wiki_harvest.classify_url(url) == "skip"
        finally:
            # Remove the temporary pattern again, pass or fail.
            patterns.pop()

    def test_atlassian_skip(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url("https://foo.atlassian.net/browse/BAR-1")
        assert verdict == "skip"

    def test_slack_skip(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url("https://myteam.slack.com/archives/C123")
        assert verdict == "skip"

    def test_github_repo_root_is_harvest(self, wiki_harvest: Any) -> None:
        # Not an issue/pr/discussion — just a repo root, might contain docs
        verdict = wiki_harvest.classify_url("https://github.com/foo/bar")
        assert verdict == "harvest"

    def test_invalid_url_skip(self, wiki_harvest: Any) -> None:
        verdict = wiki_harvest.classify_url("not a url")
        assert verdict == "skip"
class TestExtractUrls:
    """``extract_urls_from_file`` pulls harvestable URLs out of a markdown file."""

    def test_finds_urls_in_markdown(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        conversation = make_conversation(
            tmp_wiki,
            "test",
            "test.md",
            body="See https://docs.python.org/3/library/os.html for details.\n"
            "Also https://fastapi.tiangolo.com/tutorial/.\n",
        )
        found = wiki_harvest.extract_urls_from_file(conversation)
        for expected in (
            "https://docs.python.org/3/library/os.html",
            "https://fastapi.tiangolo.com/tutorial/",
        ):
            assert expected in found

    def test_filters_asset_extensions(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        asset_lines = (
            "Real: https://example.com/docs/article.html\n"
            "Image: https://example.com/logo.png\n"
            "Script: https://cdn.example.com/lib.js\n"
            "Font: https://fonts.example.com/face.woff2\n"
        )
        conversation = make_conversation(tmp_wiki, "test", "assets.md", body=asset_lines)
        found = wiki_harvest.extract_urls_from_file(conversation)
        assert "https://example.com/docs/article.html" in found
        # None of the static-asset URLs may survive extraction.
        for suffix in (".png", ".js", ".woff2"):
            assert not any(url.endswith(suffix) for url in found)

    def test_strips_trailing_punctuation(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        conversation = make_conversation(
            tmp_wiki,
            "test",
            "punct.md",
            body="See https://example.com/foo. Also https://example.com/bar, and more.\n",
        )
        found = wiki_harvest.extract_urls_from_file(conversation)
        for expected in ("https://example.com/foo", "https://example.com/bar"):
            assert expected in found

    def test_deduplicates_within_file(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        repeated = (
            "First mention: https://example.com/same\n"
            "Second mention: https://example.com/same\n"
        )
        conversation = make_conversation(tmp_wiki, "test", "dup.md", body=repeated)
        found = wiki_harvest.extract_urls_from_file(conversation)
        assert found.count("https://example.com/same") == 1

    def test_returns_empty_for_missing_file(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        missing = tmp_wiki / "nope.md"
        assert wiki_harvest.extract_urls_from_file(missing) == []

    def test_filters_short_urls(
        self, wiki_harvest: Any, tmp_wiki: Path
    ) -> None:
        # Less than 20 chars are skipped
        conversation = make_conversation(
            tmp_wiki,
            "test",
            "short.md",
            body="tiny http://a.b/ and https://example.com/long-path\n",
        )
        found = wiki_harvest.extract_urls_from_file(conversation)
        assert "http://a.b/" not in found
        assert "https://example.com/long-path" in found
class TestValidateContent:
    """``validate_content`` accepts clean markdown and rejects bad fetches."""

    def test_accepts_clean_markdown(self, wiki_harvest: Any) -> None:
        content = "# Title\n\n" + ("A clean paragraph of markdown content. " * 5)
        assert wiki_harvest.validate_content(content) is True

    def test_rejects_empty(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.validate_content("") is False

    def test_rejects_too_short(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.validate_content("# Short") is False

    def test_rejects_html_leak(self, wiki_harvest: Any) -> None:
        # The input must actually contain raw HTML, otherwise this case is
        # just long clean markdown and does not exercise the leak check.
        # NOTE(review): the exact markup validate_content looks for is not
        # visible from this test file — confirm a <html>/<div> page triggers it.
        leaked = '<html><body><div class="nav">leaked markup</div></body></html>'
        content = "# Title\n\n" + leaked + "\n" + "content " * 30
        assert wiki_harvest.validate_content(content) is False

    def test_rejects_script_tag(self, wiki_harvest: Any) -> None:
        # Same idea, with a <script> element in an otherwise long-enough body.
        # NOTE(review): assumes validate_content rejects on a literal
        # <script> tag — verify against wiki-harvest.py.
        leaked = "<script>alert('x')</script>"
        content = "# Title\n\n" + leaked + "\n" + "content " * 30
        assert wiki_harvest.validate_content(content) is False
class TestHarvestCli:
    """CLI smoke tests for ``scripts/wiki-harvest.py``."""

    def test_dry_run_no_network_calls(
        self, run_script, tmp_wiki: Path
    ) -> None:
        """A dry run over one conversation exits 0 and prints a classification."""
        make_conversation(
            tmp_wiki,
            "test",
            "test.md",
            body="See https://docs.python.org/3/ and https://github.com/foo/bar/issues/1.\n",
        )
        proc = run_script("wiki-harvest.py", "--dry-run")
        assert proc.returncode == 0
        # Dry-run should classify without fetching
        assert any(marker in proc.stdout for marker in ("would-harvest", "Summary"))

    def test_help_flag(self, run_script) -> None:
        """``--help`` exits 0 and lists the dry-run and no-compile flags."""
        proc = run_script("wiki-harvest.py", "--help")
        assert proc.returncode == 0
        for flag in ("--dry-run", "--no-compile"):
            assert flag in proc.stdout
class TestBackfill:
    """``backfill_last_verified`` fills in a missing ``last_verified`` field."""

    @staticmethod
    def _page_without_last_verified(tmp_wiki: Path) -> Path:
        """Build patterns/foo.md, then remove its ``last_verified`` line."""
        page = make_page(tmp_wiki, "patterns/foo.md", last_compiled="2026-01-15")
        # Strip last_verified from the fixture-built file
        stripped = page.read_text().replace("last_verified: 2026-04-01\n", "")
        page.write_text(stripped)
        return page

    def test_sets_last_verified_from_last_compiled(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page = self._page_without_last_verified(tmp_wiki)

        changes = wiki_hygiene.backfill_last_verified()
        assert len(changes) == 1
        assert changes[0][1] == "last_compiled"

        frontmatter = wiki_hygiene.parse_page(page).frontmatter
        assert frontmatter["last_verified"] == "2026-01-15"

    def test_skips_pages_already_verified(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/done.md", last_verified="2026-04-01")
        assert wiki_hygiene.backfill_last_verified() == []

    def test_dry_run_does_not_write(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page = self._page_without_last_verified(tmp_wiki)

        changes = wiki_hygiene.backfill_last_verified(dry_run=True)
        assert len(changes) == 1

        # The change is reported but the file on disk must be untouched.
        frontmatter = wiki_hygiene.parse_page(page).frontmatter
        assert "last_verified" not in frontmatter
TestConfidenceDecay: + def test_recent_page_unchanged(self, wiki_hygiene: Any) -> None: + recent = wiki_hygiene.today() - timedelta(days=30) + assert wiki_hygiene.expected_confidence("high", recent, False) == "high" + + def test_six_months_decays_high_to_medium(self, wiki_hygiene: Any) -> None: + old = wiki_hygiene.today() - timedelta(days=200) + assert wiki_hygiene.expected_confidence("high", old, False) == "medium" + + def test_nine_months_decays_medium_to_low(self, wiki_hygiene: Any) -> None: + old = wiki_hygiene.today() - timedelta(days=280) + assert wiki_hygiene.expected_confidence("medium", old, False) == "low" + + def test_twelve_months_decays_to_stale(self, wiki_hygiene: Any) -> None: + old = wiki_hygiene.today() - timedelta(days=400) + assert wiki_hygiene.expected_confidence("high", old, False) == "stale" + + def test_superseded_is_always_stale(self, wiki_hygiene: Any) -> None: + recent = wiki_hygiene.today() - timedelta(days=1) + assert wiki_hygiene.expected_confidence("high", recent, True) == "stale" + + def test_none_date_leaves_confidence_alone(self, wiki_hygiene: Any) -> None: + assert wiki_hygiene.expected_confidence("medium", None, False) == "medium" + + def test_bump_confidence_ladder(self, wiki_hygiene: Any) -> None: + assert wiki_hygiene.bump_confidence("stale") == "low" + assert wiki_hygiene.bump_confidence("low") == "medium" + assert wiki_hygiene.bump_confidence("medium") == "high" + assert wiki_hygiene.bump_confidence("high") == "high" + + +# --------------------------------------------------------------------------- +# Frontmatter repair +# --------------------------------------------------------------------------- + + +class TestFrontmatterRepair: + def test_adds_missing_confidence( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = tmp_wiki / "patterns" / "no-conf.md" + path.write_text( + "---\ntitle: No Confidence\ntype: pattern\n" + "last_compiled: 2026-04-01\nlast_verified: 2026-04-01\n---\n" + "# Body\n\nSubstantive content 
here for testing purposes.\n" + ) + changes = wiki_hygiene.repair_frontmatter() + assert any("confidence" in fields for _, fields in changes) + + reparsed = wiki_hygiene.parse_page(path) + assert reparsed.frontmatter["confidence"] == "medium" + + def test_fixes_invalid_confidence( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/bad-conf.md", confidence="wat") + changes = wiki_hygiene.repair_frontmatter() + assert any(p == path for p, _ in changes) + + reparsed = wiki_hygiene.parse_page(path) + assert reparsed.frontmatter["confidence"] == "medium" + + def test_leaves_valid_pages_alone( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/good.md") + changes = wiki_hygiene.repair_frontmatter() + assert changes == [] + + +# --------------------------------------------------------------------------- +# Archive and restore round-trip +# --------------------------------------------------------------------------- + + +class TestArchiveRestore: + def test_archive_moves_file_and_updates_frontmatter( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/doomed.md") + page = wiki_hygiene.parse_page(path) + + wiki_hygiene.archive_page(page, "test archive") + + assert not path.exists() + archived = tmp_wiki / "archive" / "patterns" / "doomed.md" + assert archived.exists() + + reparsed = wiki_hygiene.parse_page(archived) + assert reparsed.frontmatter["archived_reason"] == "test archive" + assert reparsed.frontmatter["original_path"] == "patterns/doomed.md" + assert reparsed.frontmatter["confidence"] == "stale" + + def test_restore_reverses_archive( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + original = make_page(tmp_wiki, "patterns/zombie.md") + page = wiki_hygiene.parse_page(original) + wiki_hygiene.archive_page(page, "test") + + archived = tmp_wiki / "archive" / "patterns" / "zombie.md" + archived_page = wiki_hygiene.parse_page(archived) + 
wiki_hygiene.restore_page(archived_page) + + assert original.exists() + assert not archived.exists() + + reparsed = wiki_hygiene.parse_page(original) + assert reparsed.frontmatter["confidence"] == "medium" + assert "archived_date" not in reparsed.frontmatter + assert "archived_reason" not in reparsed.frontmatter + assert "original_path" not in reparsed.frontmatter + + def test_archive_rejects_non_live_pages( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + # Page outside the live content dirs — should refuse to archive + weird = tmp_wiki / "raw" / "weird.md" + weird.parent.mkdir(parents=True, exist_ok=True) + weird.write_text("---\ntitle: Weird\n---\nBody\n") + page = wiki_hygiene.parse_page(weird) + result = wiki_hygiene.archive_page(page, "test") + assert result is None + + def test_archive_dry_run_does_not_move( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/safe.md") + page = wiki_hygiene.parse_page(path) + wiki_hygiene.archive_page(page, "test", dry_run=True) + assert path.exists() + assert not (tmp_wiki / "archive" / "patterns" / "safe.md").exists() + + +# --------------------------------------------------------------------------- +# Orphan detection +# --------------------------------------------------------------------------- + + +class TestOrphanDetection: + def test_finds_orphan_page(self, wiki_hygiene: Any, tmp_wiki: Path) -> None: + make_page(tmp_wiki, "patterns/lonely.md") + orphans = wiki_hygiene.find_orphan_pages() + assert len(orphans) == 1 + assert orphans[0].path.stem == "lonely" + + def test_page_referenced_in_index_is_not_orphan( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/linked.md") + idx = tmp_wiki / "index.md" + idx.write_text(idx.read_text() + "- [Linked](patterns/linked.md) — desc\n") + orphans = wiki_hygiene.find_orphan_pages() + assert not any(p.path.stem == "linked" for p in orphans) + + def test_page_referenced_in_related_is_not_orphan( + 
self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/referenced.md") + make_page( + tmp_wiki, + "patterns/referencer.md", + related=["patterns/referenced.md"], + ) + orphans = wiki_hygiene.find_orphan_pages() + stems = {p.path.stem for p in orphans} + assert "referenced" not in stems + + def test_fix_orphan_adds_to_index( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/orphan.md", title="Orphan Test") + page = wiki_hygiene.parse_page(path) + wiki_hygiene.fix_orphan_page(page) + idx_text = (tmp_wiki / "index.md").read_text() + assert "patterns/orphan.md" in idx_text + + +# --------------------------------------------------------------------------- +# Broken cross-references +# --------------------------------------------------------------------------- + + +class TestBrokenCrossRefs: + def test_detects_broken_link(self, wiki_hygiene: Any, tmp_wiki: Path) -> None: + make_page( + tmp_wiki, + "patterns/source.md", + body="See [nonexistent](patterns/does-not-exist.md) for details.\n", + ) + broken = wiki_hygiene.find_broken_cross_refs() + assert len(broken) == 1 + target, bad, suggested = broken[0] + assert bad == "patterns/does-not-exist.md" + + def test_fuzzy_match_finds_near_miss( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/health-endpoint.md") + make_page( + tmp_wiki, + "patterns/source.md", + body="See [H](patterns/health-endpoints.md) — typo.\n", + ) + broken = wiki_hygiene.find_broken_cross_refs() + assert len(broken) >= 1 + _, bad, suggested = broken[0] + assert suggested == "patterns/health-endpoint.md" + + def test_fix_broken_xref(self, wiki_hygiene: Any, tmp_wiki: Path) -> None: + make_page(tmp_wiki, "patterns/health-endpoint.md") + src = make_page( + tmp_wiki, + "patterns/source.md", + body="See [H](patterns/health-endpoints.md).\n", + ) + broken = wiki_hygiene.find_broken_cross_refs() + for target, bad, suggested in broken: + 
wiki_hygiene.fix_broken_cross_ref(target, bad, suggested) + text = src.read_text() + assert "patterns/health-endpoints.md" not in text + assert "patterns/health-endpoint.md" in text + + def test_archived_link_triggers_restore( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + # Page in archive, referenced by a live page + make_page( + tmp_wiki, + "archive/patterns/ghost.md", + confidence="stale", + extra_fm={ + "archived_date": "2026-01-01", + "archived_reason": "test", + "original_path": "patterns/ghost.md", + }, + ) + make_page( + tmp_wiki, + "patterns/caller.md", + body="See [ghost](patterns/ghost.md).\n", + ) + broken = wiki_hygiene.find_broken_cross_refs() + assert len(broken) >= 1 + for target, bad, suggested in broken: + if suggested and suggested.startswith("__RESTORE__"): + wiki_hygiene.fix_broken_cross_ref(target, bad, suggested) + # After restore, ghost should be live again + assert (tmp_wiki / "patterns" / "ghost.md").exists() + + +# --------------------------------------------------------------------------- +# Index drift +# --------------------------------------------------------------------------- + + +class TestIndexDrift: + def test_finds_page_missing_from_index( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/missing.md") + missing, stale = wiki_hygiene.find_index_drift() + assert "patterns/missing.md" in missing + assert stale == [] + + def test_finds_stale_index_entry( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + idx = tmp_wiki / "index.md" + idx.write_text( + idx.read_text() + + "- [Ghost](patterns/ghost.md) — page that no longer exists\n" + ) + missing, stale = wiki_hygiene.find_index_drift() + assert "patterns/ghost.md" in stale + + def test_fix_adds_missing_and_removes_stale( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/new.md") + idx = tmp_wiki / "index.md" + idx.write_text( + idx.read_text() + + "- [Gone](patterns/gone.md) — deleted page\n" 
+ ) + missing, stale = wiki_hygiene.find_index_drift() + wiki_hygiene.fix_index_drift(missing, stale) + idx_text = idx.read_text() + assert "patterns/new.md" in idx_text + assert "patterns/gone.md" not in idx_text + + +# --------------------------------------------------------------------------- +# Empty stubs +# --------------------------------------------------------------------------- + + +class TestEmptyStubs: + def test_flags_small_body(self, wiki_hygiene: Any, tmp_wiki: Path) -> None: + make_page(tmp_wiki, "patterns/stub.md", body="# Stub\n\nShort.\n") + stubs = wiki_hygiene.find_empty_stubs() + assert len(stubs) == 1 + assert stubs[0].path.stem == "stub" + + def test_ignores_substantive_pages( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + body = "# Full\n\n" + ("This is substantive content. " * 20) + "\n" + make_page(tmp_wiki, "patterns/full.md", body=body) + stubs = wiki_hygiene.find_empty_stubs() + assert stubs == [] + + +# --------------------------------------------------------------------------- +# Conversation refresh signals +# --------------------------------------------------------------------------- + + +class TestConversationRefreshSignals: + def test_picks_up_related_link( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/hot.md", last_verified="2026-01-01") + make_conversation( + tmp_wiki, + "test", + "2026-04-11-abc.md", + date="2026-04-11", + related=["patterns/hot.md"], + ) + refs = wiki_hygiene.scan_conversation_references() + assert "patterns/hot.md" in refs + assert refs["patterns/hot.md"] == date(2026, 4, 11) + + def test_apply_refresh_updates_last_verified( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/hot.md", last_verified="2026-01-01") + make_conversation( + tmp_wiki, + "test", + "2026-04-11-abc.md", + date="2026-04-11", + related=["patterns/hot.md"], + ) + refs = wiki_hygiene.scan_conversation_references() + changes = 
wiki_hygiene.apply_refresh_signals(refs) + assert len(changes) == 1 + + reparsed = wiki_hygiene.parse_page(path) + assert reparsed.frontmatter["last_verified"] == "2026-04-11" + + def test_bumps_low_confidence_to_medium( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page( + tmp_wiki, + "patterns/reviving.md", + confidence="low", + last_verified="2026-01-01", + ) + make_conversation( + tmp_wiki, + "test", + "2026-04-11-ref.md", + date="2026-04-11", + related=["patterns/reviving.md"], + ) + refs = wiki_hygiene.scan_conversation_references() + wiki_hygiene.apply_refresh_signals(refs) + reparsed = wiki_hygiene.parse_page(path) + assert reparsed.frontmatter["confidence"] == "medium" + + +# --------------------------------------------------------------------------- +# Auto-restore +# --------------------------------------------------------------------------- + + +class TestAutoRestore: + def test_restores_page_referenced_in_conversation( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + # Archive a page + path = make_page(tmp_wiki, "patterns/returning.md") + page = wiki_hygiene.parse_page(path) + wiki_hygiene.archive_page(page, "aging out") + assert (tmp_wiki / "archive" / "patterns" / "returning.md").exists() + + # Reference it in a conversation + make_conversation( + tmp_wiki, + "test", + "2026-04-12-ref.md", + related=["patterns/returning.md"], + ) + + # Auto-restore + restored = wiki_hygiene.auto_restore_archived() + assert len(restored) == 1 + assert (tmp_wiki / "patterns" / "returning.md").exists() + assert not (tmp_wiki / "archive" / "patterns" / "returning.md").exists() + + +# --------------------------------------------------------------------------- +# Staging / archive index sync +# --------------------------------------------------------------------------- + + +class TestIndexSync: + def test_staging_sync_regenerates_index( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/pending.md") 
+ changed = wiki_hygiene.sync_staging_index() + assert changed is True + text = (tmp_wiki / "staging" / "index.md").read_text() + assert "pending.md" in text + + def test_staging_sync_idempotent( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/pending.md") + wiki_hygiene.sync_staging_index() + changed_second = wiki_hygiene.sync_staging_index() + assert changed_second is False + + def test_archive_sync_regenerates_index( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page( + tmp_wiki, + "archive/patterns/old.md", + confidence="stale", + extra_fm={ + "archived_date": "2026-01-01", + "archived_reason": "test", + "original_path": "patterns/old.md", + }, + ) + changed = wiki_hygiene.sync_archive_index() + assert changed is True + text = (tmp_wiki / "archive" / "index.md").read_text() + assert "old" in text.lower() + + +# --------------------------------------------------------------------------- +# State drift detection +# --------------------------------------------------------------------------- + + +class TestStateDrift: + def test_detects_missing_raw_file( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + import json + state = { + "harvested_urls": { + "https://example.com": { + "raw_file": "raw/harvested/missing.md", + "wiki_pages": [], + } + } + } + (tmp_wiki / ".harvest-state.json").write_text(json.dumps(state)) + issues = wiki_hygiene.find_state_drift() + assert any("missing.md" in i for i in issues) + + def test_empty_state_has_no_drift( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + # Fixture already creates an empty .harvest-state.json + issues = wiki_hygiene.find_state_drift() + assert issues == [] + + +# --------------------------------------------------------------------------- +# Hygiene state file +# --------------------------------------------------------------------------- + + +class TestHygieneState: + def test_load_returns_defaults_when_missing( + self, wiki_hygiene: Any, 
tmp_wiki: Path + ) -> None: + state = wiki_hygiene.load_hygiene_state() + assert state["last_quick_run"] is None + assert state["pages_checked"] == {} + + def test_save_and_reload( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + state = wiki_hygiene.load_hygiene_state() + state["last_quick_run"] = "2026-04-12T00:00:00Z" + wiki_hygiene.save_hygiene_state(state) + + reloaded = wiki_hygiene.load_hygiene_state() + assert reloaded["last_quick_run"] == "2026-04-12T00:00:00Z" + + def test_mark_page_checked_stores_hash( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/tracked.md") + page = wiki_hygiene.parse_page(path) + state = wiki_hygiene.load_hygiene_state() + wiki_hygiene.mark_page_checked(state, page, "quick") + entry = state["pages_checked"]["patterns/tracked.md"] + assert entry["content_hash"].startswith("sha256:") + assert "last_checked_quick" in entry + + def test_page_changed_since_detects_body_change( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/mutable.md", body="# One\n\nOne body.\n") + page = wiki_hygiene.parse_page(path) + state = wiki_hygiene.load_hygiene_state() + wiki_hygiene.mark_page_checked(state, page, "quick") + + assert not wiki_hygiene.page_changed_since(state, page, "quick") + + # Mutate the body + path.write_text(path.read_text().replace("One body", "Two body")) + new_page = wiki_hygiene.parse_page(path) + assert wiki_hygiene.page_changed_since(state, new_page, "quick") + + +# --------------------------------------------------------------------------- +# Full quick-hygiene run end-to-end (dry-run, idempotent) +# --------------------------------------------------------------------------- + + +class TestRunQuickHygiene: + def test_empty_wiki_produces_empty_report( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + report = wiki_hygiene.run_quick_hygiene(dry_run=True) + assert report.backfilled == [] + assert report.archived == [] + + def 
test_real_run_is_idempotent( + self, wiki_hygiene: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/one.md") + make_page(tmp_wiki, "patterns/two.md") + + report1 = wiki_hygiene.run_quick_hygiene() + # Second run should have 0 work + report2 = wiki_hygiene.run_quick_hygiene() + assert report2.backfilled == [] + assert report2.decayed == [] + assert report2.archived == [] + assert report2.frontmatter_fixes == [] diff --git a/tests/test_wiki_lib.py b/tests/test_wiki_lib.py new file mode 100644 index 0000000..aab2d45 --- /dev/null +++ b/tests/test_wiki_lib.py @@ -0,0 +1,314 @@ +"""Unit tests for scripts/wiki_lib.py — the shared frontmatter library.""" + +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import Any + +import pytest + +from conftest import make_page, make_staging_page + + +# --------------------------------------------------------------------------- +# parse_yaml_lite +# --------------------------------------------------------------------------- + + +class TestParseYamlLite: + def test_simple_key_value(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("title: Hello\ntype: pattern\n") + assert result == {"title": "Hello", "type": "pattern"} + + def test_quoted_values_are_stripped(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite('title: "Hello"\nother: \'World\'\n') + assert result["title"] == "Hello" + assert result["other"] == "World" + + def test_inline_list(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("tags: [a, b, c]\n") + assert result["tags"] == ["a", "b", "c"] + + def test_empty_inline_list(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("sources: []\n") + assert result["sources"] == [] + + def test_block_list(self, wiki_lib: Any) -> None: + yaml = "related:\n - foo.md\n - bar.md\n - baz.md\n" + result = wiki_lib.parse_yaml_lite(yaml) + assert result["related"] == ["foo.md", "bar.md", "baz.md"] + + def 
test_mixed_keys(self, wiki_lib: Any) -> None: + yaml = ( + "title: Mixed\n" + "type: pattern\n" + "related:\n" + " - one.md\n" + " - two.md\n" + "confidence: high\n" + ) + result = wiki_lib.parse_yaml_lite(yaml) + assert result["title"] == "Mixed" + assert result["related"] == ["one.md", "two.md"] + assert result["confidence"] == "high" + + def test_empty_value(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("empty: \n") + assert result["empty"] == "" + + def test_comment_lines_ignored(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("# this is a comment\ntitle: X\n") + assert result == {"title": "X"} + + def test_blank_lines_ignored(self, wiki_lib: Any) -> None: + result = wiki_lib.parse_yaml_lite("\ntitle: X\n\ntype: pattern\n\n") + assert result == {"title": "X", "type": "pattern"} + + +# --------------------------------------------------------------------------- +# parse_page +# --------------------------------------------------------------------------- + + +class TestParsePage: + def test_parses_valid_page(self, wiki_lib: Any, tmp_wiki: Path) -> None: + path = make_page(tmp_wiki, "patterns/foo.md", title="Foo", confidence="high") + page = wiki_lib.parse_page(path) + assert page is not None + assert page.frontmatter["title"] == "Foo" + assert page.frontmatter["confidence"] == "high" + assert "# Content" in page.body + + def test_returns_none_without_frontmatter( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + path = tmp_wiki / "patterns" / "no-fm.md" + path.write_text("# Just a body\n\nNo frontmatter.\n") + assert wiki_lib.parse_page(path) is None + + def test_returns_none_for_missing_file(self, wiki_lib: Any, tmp_wiki: Path) -> None: + assert wiki_lib.parse_page(tmp_wiki / "nonexistent.md") is None + + def test_returns_none_for_truncated_frontmatter( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + path = tmp_wiki / "patterns" / "broken.md" + path.write_text("---\ntitle: Broken\n# never closed\n") + assert 
wiki_lib.parse_page(path) is None + + def test_preserves_body_exactly(self, wiki_lib: Any, tmp_wiki: Path) -> None: + body = "# Heading\n\nLine 1\nLine 2\n\n## Sub\n\nMore.\n" + path = make_page(tmp_wiki, "patterns/body.md", body=body) + page = wiki_lib.parse_page(path) + assert page.body == body + + +# --------------------------------------------------------------------------- +# serialize_frontmatter +# --------------------------------------------------------------------------- + + +class TestSerializeFrontmatter: + def test_preferred_key_order(self, wiki_lib: Any) -> None: + fm = { + "related": ["a.md"], + "sources": ["raw/x.md"], + "title": "T", + "confidence": "high", + "type": "pattern", + } + yaml = wiki_lib.serialize_frontmatter(fm) + lines = yaml.split("\n") + # title/type/confidence should come before sources/related + assert lines[0].startswith("title:") + assert lines[1].startswith("type:") + assert lines[2].startswith("confidence:") + assert "sources:" in yaml + assert "related:" in yaml + # sources must come before related (both are in PREFERRED_KEY_ORDER) + assert yaml.index("sources:") < yaml.index("related:") + + def test_list_formatted_as_block(self, wiki_lib: Any) -> None: + fm = {"title": "T", "related": ["one.md", "two.md"]} + yaml = wiki_lib.serialize_frontmatter(fm) + assert "related:\n - one.md\n - two.md" in yaml + + def test_empty_list(self, wiki_lib: Any) -> None: + fm = {"title": "T", "sources": []} + yaml = wiki_lib.serialize_frontmatter(fm) + assert "sources: []" in yaml + + def test_unknown_keys_appear_alphabetically_at_end(self, wiki_lib: Any) -> None: + fm = {"title": "T", "type": "pattern", "zoo": "z", "alpha": "a"} + yaml = wiki_lib.serialize_frontmatter(fm) + # alpha should come before zoo (alphabetical) + assert yaml.index("alpha:") < yaml.index("zoo:") + + +# --------------------------------------------------------------------------- +# Round-trip: parse_page → write_page → parse_page +# 
--------------------------------------------------------------------------- + + +class TestRoundTrip: + def test_round_trip_preserves_core_fields( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + path = make_page( + tmp_wiki, + "patterns/rt.md", + title="Round Trip", + sources=["raw/a.md", "raw/b.md"], + related=["patterns/other.md"], + ) + page1 = wiki_lib.parse_page(path) + wiki_lib.write_page(page1) + page2 = wiki_lib.parse_page(path) + assert page2.frontmatter["title"] == "Round Trip" + assert page2.frontmatter["sources"] == ["raw/a.md", "raw/b.md"] + assert page2.frontmatter["related"] == ["patterns/other.md"] + assert page2.body == page1.body + + def test_round_trip_preserves_mutation( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + path = make_page(tmp_wiki, "patterns/rt.md", confidence="high") + page = wiki_lib.parse_page(path) + page.frontmatter["confidence"] = "low" + wiki_lib.write_page(page) + page2 = wiki_lib.parse_page(path) + assert page2.frontmatter["confidence"] == "low" + + +# --------------------------------------------------------------------------- +# parse_date +# --------------------------------------------------------------------------- + + +class TestParseDate: + def test_iso_format(self, wiki_lib: Any) -> None: + assert wiki_lib.parse_date("2026-04-10") == date(2026, 4, 10) + + def test_empty_string_returns_none(self, wiki_lib: Any) -> None: + assert wiki_lib.parse_date("") is None + + def test_none_returns_none(self, wiki_lib: Any) -> None: + assert wiki_lib.parse_date(None) is None + + def test_invalid_format_returns_none(self, wiki_lib: Any) -> None: + assert wiki_lib.parse_date("not-a-date") is None + assert wiki_lib.parse_date("2026/04/10") is None + assert wiki_lib.parse_date("04-10-2026") is None + + def test_date_object_passthrough(self, wiki_lib: Any) -> None: + d = date(2026, 4, 10) + assert wiki_lib.parse_date(d) == d + + +# --------------------------------------------------------------------------- +# page_content_hash 
+# --------------------------------------------------------------------------- + + +class TestPageContentHash: + def test_deterministic(self, wiki_lib: Any, tmp_wiki: Path) -> None: + path = make_page(tmp_wiki, "patterns/h.md", body="# Same body\n\nLine.\n") + page = wiki_lib.parse_page(path) + h1 = wiki_lib.page_content_hash(page) + h2 = wiki_lib.page_content_hash(page) + assert h1 == h2 + assert h1.startswith("sha256:") + + def test_different_bodies_yield_different_hashes( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + p1 = make_page(tmp_wiki, "patterns/a.md", body="# A\n\nAlpha.\n") + p2 = make_page(tmp_wiki, "patterns/b.md", body="# B\n\nBeta.\n") + h1 = wiki_lib.page_content_hash(wiki_lib.parse_page(p1)) + h2 = wiki_lib.page_content_hash(wiki_lib.parse_page(p2)) + assert h1 != h2 + + def test_frontmatter_changes_dont_change_hash( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + """Hash is body-only so mechanical frontmatter fixes don't churn it.""" + path = make_page(tmp_wiki, "patterns/f.md", confidence="high") + page = wiki_lib.parse_page(path) + h1 = wiki_lib.page_content_hash(page) + + page.frontmatter["confidence"] = "medium" + wiki_lib.write_page(page) + page2 = wiki_lib.parse_page(path) + h2 = wiki_lib.page_content_hash(page2) + assert h1 == h2 + + +# --------------------------------------------------------------------------- +# Iterators +# --------------------------------------------------------------------------- + + +class TestIterators: + def test_iter_live_pages_finds_all_types( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/p1.md") + make_page(tmp_wiki, "patterns/p2.md") + make_page(tmp_wiki, "decisions/d1.md") + make_page(tmp_wiki, "concepts/c1.md") + make_page(tmp_wiki, "environments/e1.md") + pages = wiki_lib.iter_live_pages() + assert len(pages) == 5 + stems = {p.path.stem for p in pages} + assert stems == {"p1", "p2", "d1", "c1", "e1"} + + def test_iter_live_pages_empty_wiki( + self, wiki_lib: 
Any, tmp_wiki: Path + ) -> None: + assert wiki_lib.iter_live_pages() == [] + + def test_iter_staging_pages(self, wiki_lib: Any, tmp_wiki: Path) -> None: + make_staging_page(tmp_wiki, "patterns/s1.md") + make_staging_page(tmp_wiki, "decisions/s2.md", ptype="decision") + pages = wiki_lib.iter_staging_pages() + assert len(pages) == 2 + assert all(p.frontmatter.get("status") == "pending" for p in pages) + + def test_iter_archived_pages(self, wiki_lib: Any, tmp_wiki: Path) -> None: + make_page( + tmp_wiki, + "archive/patterns/old.md", + confidence="stale", + extra_fm={ + "archived_date": "2026-01-01", + "archived_reason": "test", + "original_path": "patterns/old.md", + }, + ) + pages = wiki_lib.iter_archived_pages() + assert len(pages) == 1 + assert pages[0].frontmatter["archived_reason"] == "test" + + def test_iter_skips_malformed_pages( + self, wiki_lib: Any, tmp_wiki: Path + ) -> None: + make_page(tmp_wiki, "patterns/good.md") + (tmp_wiki / "patterns" / "no-fm.md").write_text("# Just a body\n") + pages = wiki_lib.iter_live_pages() + assert len(pages) == 1 + assert pages[0].path.stem == "good" + + +# --------------------------------------------------------------------------- +# WIKI_DIR env var override +# --------------------------------------------------------------------------- + + +class TestWikiDirEnvVar: + def test_honors_env_var(self, wiki_lib: Any, tmp_wiki: Path) -> None: + """The tmp_wiki fixture sets WIKI_DIR — verify wiki_lib picks it up.""" + assert wiki_lib.WIKI_DIR == tmp_wiki + assert wiki_lib.STAGING_DIR == tmp_wiki / "staging" + assert wiki_lib.ARCHIVE_DIR == tmp_wiki / "archive" + assert wiki_lib.INDEX_FILE == tmp_wiki / "index.md" diff --git a/tests/test_wiki_staging.py b/tests/test_wiki_staging.py new file mode 100644 index 0000000..82f2b73 --- /dev/null +++ b/tests/test_wiki_staging.py @@ -0,0 +1,267 @@ +"""Integration tests for scripts/wiki-staging.py.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing 
import Any + +import pytest + +from conftest import make_page, make_staging_page + + +# --------------------------------------------------------------------------- +# List + page_summary +# --------------------------------------------------------------------------- + + +class TestListPending: + def test_empty_staging(self, wiki_staging: Any, tmp_wiki: Path) -> None: + assert wiki_staging.list_pending() == [] + + def test_finds_pages_in_all_type_subdirs( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/p.md", ptype="pattern") + make_staging_page(tmp_wiki, "decisions/d.md", ptype="decision") + make_staging_page(tmp_wiki, "concepts/c.md", ptype="concept") + pending = wiki_staging.list_pending() + assert len(pending) == 3 + + def test_skips_staging_index_md( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + (tmp_wiki / "staging" / "index.md").write_text( + "---\ntitle: Index\n---\n# staging index\n" + ) + make_staging_page(tmp_wiki, "patterns/real.md") + pending = wiki_staging.list_pending() + assert len(pending) == 1 + assert pending[0].path.stem == "real" + + def test_page_summary_populates_all_fields( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page( + tmp_wiki, + "patterns/sample.md", + title="Sample", + staged_by="wiki-harvest", + staged_date="2026-04-10", + target_path="patterns/sample.md", + ) + pending = wiki_staging.list_pending() + summary = wiki_staging.page_summary(pending[0]) + assert summary["title"] == "Sample" + assert summary["type"] == "pattern" + assert summary["staged_by"] == "wiki-harvest" + assert summary["target_path"] == "patterns/sample.md" + assert summary["modifies"] is None + + +# --------------------------------------------------------------------------- +# Promote +# --------------------------------------------------------------------------- + + +class TestPromote: + def test_moves_file_to_live( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + 
make_staging_page(tmp_wiki, "patterns/new.md", title="New Page") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "new.md") + result = wiki_staging.promote(page) + assert result is not None + assert (tmp_wiki / "patterns" / "new.md").exists() + assert not (tmp_wiki / "staging" / "patterns" / "new.md").exists() + + def test_strips_staging_only_fields( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/clean.md") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "clean.md") + wiki_staging.promote(page) + + promoted = wiki_staging.parse_page(tmp_wiki / "patterns" / "clean.md") + for field in ("status", "staged_date", "staged_by", "target_path", "compilation_notes"): + assert field not in promoted.frontmatter + + def test_preserves_origin_automated( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/auto.md") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "auto.md") + wiki_staging.promote(page) + promoted = wiki_staging.parse_page(tmp_wiki / "patterns" / "auto.md") + assert promoted.frontmatter["origin"] == "automated" + + def test_updates_main_index( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/indexed.md", title="Indexed Page") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "indexed.md") + wiki_staging.promote(page) + + idx = (tmp_wiki / "index.md").read_text() + assert "patterns/indexed.md" in idx + + def test_regenerates_staging_index( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/one.md") + make_staging_page(tmp_wiki, "patterns/two.md") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "one.md") + wiki_staging.promote(page) + + idx = (tmp_wiki / "staging" / "index.md").read_text() + assert "two.md" in idx + assert "1 pending" in idx + + def test_dry_run_does_not_move( + self, 
wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/safe.md") + page = wiki_staging.parse_page(tmp_wiki / "staging" / "patterns" / "safe.md") + wiki_staging.promote(page, dry_run=True) + assert (tmp_wiki / "staging" / "patterns" / "safe.md").exists() + assert not (tmp_wiki / "patterns" / "safe.md").exists() + + +# --------------------------------------------------------------------------- +# Promote with modifies field +# --------------------------------------------------------------------------- + + +class TestPromoteUpdate: + def test_update_overwrites_existing_live_page( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + # Existing live page + make_page( + tmp_wiki, + "patterns/existing.md", + title="Old Title", + last_compiled="2026-01-01", + ) + # Staging update with `modifies` + make_staging_page( + tmp_wiki, + "patterns/existing.md", + title="New Title", + modifies="patterns/existing.md", + target_path="patterns/existing.md", + ) + page = wiki_staging.parse_page( + tmp_wiki / "staging" / "patterns" / "existing.md" + ) + wiki_staging.promote(page) + + live = wiki_staging.parse_page(tmp_wiki / "patterns" / "existing.md") + assert live.frontmatter["title"] == "New Title" + + +# --------------------------------------------------------------------------- +# Reject +# --------------------------------------------------------------------------- + + +class TestReject: + def test_deletes_file(self, wiki_staging: Any, tmp_wiki: Path) -> None: + path = make_staging_page(tmp_wiki, "patterns/bad.md") + page = wiki_staging.parse_page(path) + wiki_staging.reject(page, "duplicate") + assert not path.exists() + + def test_records_rejection_in_harvest_state( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + # Create a raw harvested file with a source_url + raw = tmp_wiki / "raw" / "harvested" / "example-com-test.md" + raw.parent.mkdir(parents=True, exist_ok=True) + raw.write_text( + "---\n" + "source_url: 
https://example.com/test\n" + "fetched_date: 2026-04-10\n" + "fetch_method: trafilatura\n" + "discovered_in: conversations/mc/test.md\n" + "content_hash: sha256:abc\n" + "---\n" + "# Example\n" + ) + + # Create a staging page that references it + make_staging_page(tmp_wiki, "patterns/reject-me.md") + staging_path = tmp_wiki / "staging" / "patterns" / "reject-me.md" + # Inject sources so reject() finds the harvest_source + page = wiki_staging.parse_page(staging_path) + page.frontmatter["sources"] = ["raw/harvested/example-com-test.md"] + wiki_staging.write_page(page) + + page = wiki_staging.parse_page(staging_path) + wiki_staging.reject(page, "test rejection") + + state = json.loads((tmp_wiki / ".harvest-state.json").read_text()) + assert "https://example.com/test" in state["rejected_urls"] + assert state["rejected_urls"]["https://example.com/test"]["reason"] == "test rejection" + + def test_reject_dry_run_keeps_file( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + path = make_staging_page(tmp_wiki, "patterns/kept.md") + page = wiki_staging.parse_page(path) + wiki_staging.reject(page, "test", dry_run=True) + assert path.exists() + + +# --------------------------------------------------------------------------- +# Staging index regeneration +# --------------------------------------------------------------------------- + + +class TestStagingIndexRegen: + def test_empty_index_shows_none( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + wiki_staging.regenerate_staging_index() + idx = (tmp_wiki / "staging" / "index.md").read_text() + assert "0 pending" in idx + assert "No pending items" in idx + + def test_lists_pending_items( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/a.md", title="A") + make_staging_page(tmp_wiki, "decisions/b.md", title="B", ptype="decision") + wiki_staging.regenerate_staging_index() + idx = (tmp_wiki / "staging" / "index.md").read_text() + assert "2 pending" in idx + assert "A" in 
idx and "B" in idx + + +# --------------------------------------------------------------------------- +# Path resolution +# --------------------------------------------------------------------------- + + +class TestResolvePage: + def test_resolves_staging_relative_path( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/foo.md") + page = wiki_staging.resolve_page("staging/patterns/foo.md") + assert page is not None + assert page.path.name == "foo.md" + + def test_returns_none_for_missing( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + assert wiki_staging.resolve_page("staging/patterns/does-not-exist.md") is None + + def test_resolves_bare_patterns_path_as_staging( + self, wiki_staging: Any, tmp_wiki: Path + ) -> None: + make_staging_page(tmp_wiki, "patterns/bare.md") + page = wiki_staging.resolve_page("patterns/bare.md") + assert page is not None + assert "staging" in str(page.path)