Initial commit — memex

A compounding LLM-maintained knowledge wiki.

Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's
mempalace, with an automation layer on top for conversation mining, URL
harvesting, human-in-the-loop staging, staleness decay, and hygiene.

Includes:
- 11 pipeline scripts (extract, summarize, index, harvest, stage,
  hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for
  the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
This commit is contained in:
Eric Turner
2026-04-12 21:16:02 -06:00
commit ee54a2f5d4
31 changed files with 10792 additions and 0 deletions

616
tests/test_wiki_hygiene.py Normal file
View File

@@ -0,0 +1,616 @@
"""Integration tests for scripts/wiki-hygiene.py.
Uses the tmp_wiki fixture so tests never touch the real wiki.
"""
from __future__ import annotations

import json
from datetime import date, timedelta
from pathlib import Path
from typing import Any

import pytest

from conftest import make_conversation, make_page, make_staging_page
# ---------------------------------------------------------------------------
# Backfill last_verified
# ---------------------------------------------------------------------------
class TestBackfill:
    """Backfilling ``last_verified`` from ``last_compiled``."""

    def test_sets_last_verified_from_last_compiled(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(tmp_wiki, "patterns/foo.md", last_compiled="2026-01-15")
        # The fixture writes a last_verified line by default; strip it so the
        # backfill actually has work to do.
        contents = page_path.read_text()
        page_path.write_text(contents.replace("last_verified: 2026-04-01\n", ""))

        changes = wiki_hygiene.backfill_last_verified()

        assert len(changes) == 1
        assert changes[0][1] == "last_compiled"
        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert fm["last_verified"] == "2026-01-15"

    def test_skips_pages_already_verified(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/done.md", last_verified="2026-04-01")
        assert wiki_hygiene.backfill_last_verified() == []

    def test_dry_run_does_not_write(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(tmp_wiki, "patterns/foo.md", last_compiled="2026-01-15")
        page_path.write_text(
            page_path.read_text().replace("last_verified: 2026-04-01\n", "")
        )

        changes = wiki_hygiene.backfill_last_verified(dry_run=True)

        assert len(changes) == 1
        # Dry run reports the change but must leave the file untouched.
        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert "last_verified" not in fm
# ---------------------------------------------------------------------------
# Confidence decay math
# ---------------------------------------------------------------------------
class TestConfidenceDecay:
    """Pure math of the staleness-decay ladder."""

    def test_recent_page_unchanged(self, wiki_hygiene: Any) -> None:
        verified = wiki_hygiene.today() - timedelta(days=30)
        assert wiki_hygiene.expected_confidence("high", verified, False) == "high"

    def test_six_months_decays_high_to_medium(self, wiki_hygiene: Any) -> None:
        verified = wiki_hygiene.today() - timedelta(days=200)
        assert wiki_hygiene.expected_confidence("high", verified, False) == "medium"

    def test_nine_months_decays_medium_to_low(self, wiki_hygiene: Any) -> None:
        verified = wiki_hygiene.today() - timedelta(days=280)
        assert wiki_hygiene.expected_confidence("medium", verified, False) == "low"

    def test_twelve_months_decays_to_stale(self, wiki_hygiene: Any) -> None:
        verified = wiki_hygiene.today() - timedelta(days=400)
        assert wiki_hygiene.expected_confidence("high", verified, False) == "stale"

    def test_superseded_is_always_stale(self, wiki_hygiene: Any) -> None:
        # A superseded page is stale regardless of how recently it was verified.
        verified = wiki_hygiene.today() - timedelta(days=1)
        assert wiki_hygiene.expected_confidence("high", verified, True) == "stale"

    def test_none_date_leaves_confidence_alone(self, wiki_hygiene: Any) -> None:
        assert wiki_hygiene.expected_confidence("medium", None, False) == "medium"

    def test_bump_confidence_ladder(self, wiki_hygiene: Any) -> None:
        # Each rung bumps one step up; "high" is the ceiling.
        ladder = [
            ("stale", "low"),
            ("low", "medium"),
            ("medium", "high"),
            ("high", "high"),
        ]
        for start, expected in ladder:
            assert wiki_hygiene.bump_confidence(start) == expected
# ---------------------------------------------------------------------------
# Frontmatter repair
# ---------------------------------------------------------------------------
class TestFrontmatterRepair:
    """Repair of missing or invalid frontmatter fields."""

    def test_adds_missing_confidence(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = tmp_wiki / "patterns" / "no-conf.md"
        page_path.write_text(
            "---\ntitle: No Confidence\ntype: pattern\n"
            "last_compiled: 2026-04-01\nlast_verified: 2026-04-01\n---\n"
            "# Body\n\nSubstantive content here for testing purposes.\n"
        )

        changes = wiki_hygiene.repair_frontmatter()

        assert any("confidence" in fields for _, fields in changes)
        # A missing confidence field is repaired to "medium".
        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert fm["confidence"] == "medium"

    def test_fixes_invalid_confidence(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(tmp_wiki, "patterns/bad-conf.md", confidence="wat")

        changes = wiki_hygiene.repair_frontmatter()

        assert any(changed == page_path for changed, _ in changes)
        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert fm["confidence"] == "medium"

    def test_leaves_valid_pages_alone(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/good.md")
        assert wiki_hygiene.repair_frontmatter() == []
# ---------------------------------------------------------------------------
# Archive and restore round-trip
# ---------------------------------------------------------------------------
class TestArchiveRestore:
    """Archive / restore round-trip semantics."""

    def test_archive_moves_file_and_updates_frontmatter(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        live = make_page(tmp_wiki, "patterns/doomed.md")
        wiki_hygiene.archive_page(wiki_hygiene.parse_page(live), "test archive")

        assert not live.exists()
        archived_path = tmp_wiki / "archive" / "patterns" / "doomed.md"
        assert archived_path.exists()

        fm = wiki_hygiene.parse_page(archived_path).frontmatter
        assert fm["archived_reason"] == "test archive"
        assert fm["original_path"] == "patterns/doomed.md"
        assert fm["confidence"] == "stale"

    def test_restore_reverses_archive(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        live = make_page(tmp_wiki, "patterns/zombie.md")
        wiki_hygiene.archive_page(wiki_hygiene.parse_page(live), "test")
        archived_path = tmp_wiki / "archive" / "patterns" / "zombie.md"

        wiki_hygiene.restore_page(wiki_hygiene.parse_page(archived_path))

        assert live.exists()
        assert not archived_path.exists()
        fm = wiki_hygiene.parse_page(live).frontmatter
        assert fm["confidence"] == "medium"
        # Restore must scrub every archive-bookkeeping field.
        for key in ("archived_date", "archived_reason", "original_path"):
            assert key not in fm

    def test_archive_rejects_non_live_pages(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        # Page outside the live content dirs — should refuse to archive
        stray = tmp_wiki / "raw" / "weird.md"
        stray.parent.mkdir(parents=True, exist_ok=True)
        stray.write_text("---\ntitle: Weird\n---\nBody\n")

        result = wiki_hygiene.archive_page(wiki_hygiene.parse_page(stray), "test")

        assert result is None

    def test_archive_dry_run_does_not_move(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        live = make_page(tmp_wiki, "patterns/safe.md")
        wiki_hygiene.archive_page(
            wiki_hygiene.parse_page(live), "test", dry_run=True
        )
        assert live.exists()
        assert not (tmp_wiki / "archive" / "patterns" / "safe.md").exists()
# ---------------------------------------------------------------------------
# Orphan detection
# ---------------------------------------------------------------------------
class TestOrphanDetection:
    """Pages with no inbound references are orphans."""

    def test_finds_orphan_page(self, wiki_hygiene: Any, tmp_wiki: Path) -> None:
        make_page(tmp_wiki, "patterns/lonely.md")
        orphans = wiki_hygiene.find_orphan_pages()
        assert [o.path.stem for o in orphans] == ["lonely"]

    def test_page_referenced_in_index_is_not_orphan(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/linked.md")
        index_path = tmp_wiki / "index.md"
        index_path.write_text(
            index_path.read_text() + "- [Linked](patterns/linked.md) — desc\n"
        )
        orphan_stems = {o.path.stem for o in wiki_hygiene.find_orphan_pages()}
        assert "linked" not in orphan_stems

    def test_page_referenced_in_related_is_not_orphan(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/referenced.md")
        make_page(
            tmp_wiki,
            "patterns/referencer.md",
            related=["patterns/referenced.md"],
        )
        orphan_stems = {o.path.stem for o in wiki_hygiene.find_orphan_pages()}
        assert "referenced" not in orphan_stems

    def test_fix_orphan_adds_to_index(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(tmp_wiki, "patterns/orphan.md", title="Orphan Test")
        wiki_hygiene.fix_orphan_page(wiki_hygiene.parse_page(page_path))
        # The fixer should have linked the page from index.md.
        assert "patterns/orphan.md" in (tmp_wiki / "index.md").read_text()
# ---------------------------------------------------------------------------
# Broken cross-references
# ---------------------------------------------------------------------------
class TestBrokenCrossRefs:
    """Detection and repair of dangling markdown cross-references."""

    def test_detects_broken_link(self, wiki_hygiene: Any, tmp_wiki: Path) -> None:
        make_page(
            tmp_wiki,
            "patterns/source.md",
            body="See [nonexistent](patterns/does-not-exist.md) for details.\n",
        )
        broken = wiki_hygiene.find_broken_cross_refs()
        assert len(broken) == 1
        _, bad_link, _ = broken[0]
        assert bad_link == "patterns/does-not-exist.md"

    def test_fuzzy_match_finds_near_miss(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/health-endpoint.md")
        make_page(
            tmp_wiki,
            "patterns/source.md",
            body="See [H](patterns/health-endpoints.md) — typo.\n",
        )
        broken = wiki_hygiene.find_broken_cross_refs()
        assert len(broken) >= 1
        # The singular page should be offered as the fix for the plural typo.
        assert broken[0][2] == "patterns/health-endpoint.md"

    def test_fix_broken_xref(self, wiki_hygiene: Any, tmp_wiki: Path) -> None:
        make_page(tmp_wiki, "patterns/health-endpoint.md")
        source = make_page(
            tmp_wiki,
            "patterns/source.md",
            body="See [H](patterns/health-endpoints.md).\n",
        )
        for page, bad_link, replacement in wiki_hygiene.find_broken_cross_refs():
            wiki_hygiene.fix_broken_cross_ref(page, bad_link, replacement)
        rewritten = source.read_text()
        assert "patterns/health-endpoints.md" not in rewritten
        assert "patterns/health-endpoint.md" in rewritten

    def test_archived_link_triggers_restore(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        # Page in archive, referenced by a live page
        make_page(
            tmp_wiki,
            "archive/patterns/ghost.md",
            confidence="stale",
            extra_fm={
                "archived_date": "2026-01-01",
                "archived_reason": "test",
                "original_path": "patterns/ghost.md",
            },
        )
        make_page(
            tmp_wiki,
            "patterns/caller.md",
            body="See [ghost](patterns/ghost.md).\n",
        )
        broken = wiki_hygiene.find_broken_cross_refs()
        assert len(broken) >= 1
        for page, bad_link, replacement in broken:
            if replacement and replacement.startswith("__RESTORE__"):
                wiki_hygiene.fix_broken_cross_ref(page, bad_link, replacement)
        # After restore, ghost should be live again
        assert (tmp_wiki / "patterns" / "ghost.md").exists()
# ---------------------------------------------------------------------------
# Index drift
# ---------------------------------------------------------------------------
class TestIndexDrift:
    """Drift between index.md and the pages actually on disk."""

    def test_finds_page_missing_from_index(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/missing.md")
        missing, stale = wiki_hygiene.find_index_drift()
        assert "patterns/missing.md" in missing
        assert stale == []

    def test_finds_stale_index_entry(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        index_path = tmp_wiki / "index.md"
        entry = "- [Ghost](patterns/ghost.md) — page that no longer exists\n"
        index_path.write_text(index_path.read_text() + entry)
        _, stale = wiki_hygiene.find_index_drift()
        assert "patterns/ghost.md" in stale

    def test_fix_adds_missing_and_removes_stale(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/new.md")
        index_path = tmp_wiki / "index.md"
        index_path.write_text(
            index_path.read_text() + "- [Gone](patterns/gone.md) — deleted page\n"
        )

        missing, stale = wiki_hygiene.find_index_drift()
        wiki_hygiene.fix_index_drift(missing, stale)

        index_text = index_path.read_text()
        assert "patterns/new.md" in index_text
        assert "patterns/gone.md" not in index_text
# ---------------------------------------------------------------------------
# Empty stubs
# ---------------------------------------------------------------------------
class TestEmptyStubs:
    """Pages with near-empty bodies are flagged as stubs."""

    def test_flags_small_body(self, wiki_hygiene: Any, tmp_wiki: Path) -> None:
        make_page(tmp_wiki, "patterns/stub.md", body="# Stub\n\nShort.\n")
        stubs = wiki_hygiene.find_empty_stubs()
        assert [s.path.stem for s in stubs] == ["stub"]

    def test_ignores_substantive_pages(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        long_body = "# Full\n\n" + ("This is substantive content. " * 20) + "\n"
        make_page(tmp_wiki, "patterns/full.md", body=long_body)
        assert wiki_hygiene.find_empty_stubs() == []
# ---------------------------------------------------------------------------
# Conversation refresh signals
# ---------------------------------------------------------------------------
class TestConversationRefreshSignals:
    """Conversations that link a page refresh its verification date."""

    def test_picks_up_related_link(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(tmp_wiki, "patterns/hot.md", last_verified="2026-01-01")
        make_conversation(
            tmp_wiki,
            "test",
            "2026-04-11-abc.md",
            date="2026-04-11",
            related=["patterns/hot.md"],
        )
        refs = wiki_hygiene.scan_conversation_references()
        # The page must be present, keyed by path, with the conversation date.
        assert refs.get("patterns/hot.md") == date(2026, 4, 11)

    def test_apply_refresh_updates_last_verified(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(tmp_wiki, "patterns/hot.md", last_verified="2026-01-01")
        make_conversation(
            tmp_wiki,
            "test",
            "2026-04-11-abc.md",
            date="2026-04-11",
            related=["patterns/hot.md"],
        )

        changes = wiki_hygiene.apply_refresh_signals(
            wiki_hygiene.scan_conversation_references()
        )

        assert len(changes) == 1
        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert fm["last_verified"] == "2026-04-11"

    def test_bumps_low_confidence_to_medium(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(
            tmp_wiki,
            "patterns/reviving.md",
            confidence="low",
            last_verified="2026-01-01",
        )
        make_conversation(
            tmp_wiki,
            "test",
            "2026-04-11-ref.md",
            date="2026-04-11",
            related=["patterns/reviving.md"],
        )

        wiki_hygiene.apply_refresh_signals(
            wiki_hygiene.scan_conversation_references()
        )

        fm = wiki_hygiene.parse_page(page_path).frontmatter
        assert fm["confidence"] == "medium"
# ---------------------------------------------------------------------------
# Auto-restore
# ---------------------------------------------------------------------------
class TestAutoRestore:
    """Archived pages referenced by new conversations come back automatically."""

    def test_restores_page_referenced_in_conversation(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        # Archive a page first.
        live = make_page(tmp_wiki, "patterns/returning.md")
        wiki_hygiene.archive_page(wiki_hygiene.parse_page(live), "aging out")
        archived_path = tmp_wiki / "archive" / "patterns" / "returning.md"
        assert archived_path.exists()

        # A fresh conversation references the archived page...
        make_conversation(
            tmp_wiki,
            "test",
            "2026-04-12-ref.md",
            related=["patterns/returning.md"],
        )

        # ...so auto-restore should move it back to the live tree.
        restored = wiki_hygiene.auto_restore_archived()
        assert len(restored) == 1
        assert (tmp_wiki / "patterns" / "returning.md").exists()
        assert not archived_path.exists()
# ---------------------------------------------------------------------------
# Staging / archive index sync
# ---------------------------------------------------------------------------
class TestIndexSync:
    """Regeneration of the staging and archive index files."""

    def test_staging_sync_regenerates_index(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_staging_page(tmp_wiki, "patterns/pending.md")
        assert wiki_hygiene.sync_staging_index() is True
        index_text = (tmp_wiki / "staging" / "index.md").read_text()
        assert "pending.md" in index_text

    def test_staging_sync_idempotent(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_staging_page(tmp_wiki, "patterns/pending.md")
        wiki_hygiene.sync_staging_index()
        # Nothing changed on disk, so the second sync reports no work.
        assert wiki_hygiene.sync_staging_index() is False

    def test_archive_sync_regenerates_index(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        make_page(
            tmp_wiki,
            "archive/patterns/old.md",
            confidence="stale",
            extra_fm={
                "archived_date": "2026-01-01",
                "archived_reason": "test",
                "original_path": "patterns/old.md",
            },
        )
        assert wiki_hygiene.sync_archive_index() is True
        index_text = (tmp_wiki / "archive" / "index.md").read_text()
        assert "old" in index_text.lower()
# ---------------------------------------------------------------------------
# State drift detection
# ---------------------------------------------------------------------------
class TestStateDrift:
    """Drift between .harvest-state.json and the files on disk.

    Fix: the original did a function-scoped ``import json``; per PEP 8 the
    import belongs at the top of the module (added to the file's import block).
    """

    def test_detects_missing_raw_file(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        # State claims a raw file that was never written to disk.
        state = {
            "harvested_urls": {
                "https://example.com": {
                    "raw_file": "raw/harvested/missing.md",
                    "wiki_pages": [],
                }
            }
        }
        (tmp_wiki / ".harvest-state.json").write_text(json.dumps(state))

        issues = wiki_hygiene.find_state_drift()

        assert any("missing.md" in issue for issue in issues)

    def test_empty_state_has_no_drift(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        # Fixture already creates an empty .harvest-state.json
        issues = wiki_hygiene.find_state_drift()
        assert issues == []
# ---------------------------------------------------------------------------
# Hygiene state file
# ---------------------------------------------------------------------------
class TestHygieneState:
    """Persistence of the hygiene-run bookkeeping file."""

    def test_load_returns_defaults_when_missing(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        defaults = wiki_hygiene.load_hygiene_state()
        assert defaults["last_quick_run"] is None
        assert defaults["pages_checked"] == {}

    def test_save_and_reload(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        state = wiki_hygiene.load_hygiene_state()
        state["last_quick_run"] = "2026-04-12T00:00:00Z"
        wiki_hygiene.save_hygiene_state(state)
        reloaded = wiki_hygiene.load_hygiene_state()
        assert reloaded["last_quick_run"] == "2026-04-12T00:00:00Z"

    def test_mark_page_checked_stores_hash(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page = wiki_hygiene.parse_page(make_page(tmp_wiki, "patterns/tracked.md"))
        state = wiki_hygiene.load_hygiene_state()

        wiki_hygiene.mark_page_checked(state, page, "quick")

        record = state["pages_checked"]["patterns/tracked.md"]
        assert record["content_hash"].startswith("sha256:")
        assert "last_checked_quick" in record

    def test_page_changed_since_detects_body_change(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        page_path = make_page(
            tmp_wiki, "patterns/mutable.md", body="# One\n\nOne body.\n"
        )
        state = wiki_hygiene.load_hygiene_state()
        wiki_hygiene.mark_page_checked(
            state, wiki_hygiene.parse_page(page_path), "quick"
        )
        assert not wiki_hygiene.page_changed_since(
            state, wiki_hygiene.parse_page(page_path), "quick"
        )
        # Rewriting the body must flip the changed-since check.
        page_path.write_text(page_path.read_text().replace("One body", "Two body"))
        assert wiki_hygiene.page_changed_since(
            state, wiki_hygiene.parse_page(page_path), "quick"
        )
# ---------------------------------------------------------------------------
# Full quick-hygiene run end-to-end (dry-run, idempotent)
# ---------------------------------------------------------------------------
class TestRunQuickHygiene:
    """End-to-end quick-hygiene runs (dry-run and idempotence)."""

    def test_empty_wiki_produces_empty_report(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        report = wiki_hygiene.run_quick_hygiene(dry_run=True)
        assert report.backfilled == []
        assert report.archived == []

    def test_real_run_is_idempotent(
        self, wiki_hygiene: Any, tmp_wiki: Path
    ) -> None:
        for rel_path in ("patterns/one.md", "patterns/two.md"):
            make_page(tmp_wiki, rel_path)
        wiki_hygiene.run_quick_hygiene()
        # A second pass over an already-clean wiki should find nothing to do.
        rerun = wiki_hygiene.run_quick_hygiene()
        assert rerun.backfilled == []
        assert rerun.decayed == []
        assert rerun.archived == []
        assert rerun.frontmatter_fixes == []