Initial commit — memex
A compounding LLM-maintained knowledge wiki. Synthesis of Andrej Karpathy's persistent-wiki gist and milla-jovovich's mempalace, with an automation layer on top for conversation mining, URL harvesting, human-in-the-loop staging, staleness decay, and hygiene. Includes:

- 11 pipeline scripts (extract, summarize, index, harvest, stage, hygiene, maintain, sync, + shared library)
- Full docs: README, SETUP, ARCHITECTURE, DESIGN-RATIONALE, CUSTOMIZE
- Example CLAUDE.md files (wiki schema + global instructions) tuned for the three-collection qmd setup
- 171-test pytest suite (cross-platform, runs in ~1.3s)
- MIT licensed
This commit is contained in:
323
tests/test_wiki_harvest.py
Normal file
323
tests/test_wiki_harvest.py
Normal file
@@ -0,0 +1,323 @@
|
||||
"""Unit + integration tests for scripts/wiki-harvest.py."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import make_conversation
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClassifyUrl:
    """classify_url(): routing of URLs into 'harvest' / 'check' / 'skip'."""

    def test_regular_docs_site_harvest(self, wiki_harvest: Any) -> None:
        for url in (
            "https://docs.python.org/3/library/os.html",
            "https://blog.example.com/post",
        ):
            assert wiki_harvest.classify_url(url) == "harvest"

    def test_github_issue_is_check(self, wiki_harvest: Any) -> None:
        url = "https://github.com/foo/bar/issues/42"
        assert wiki_harvest.classify_url(url) == "check"

    def test_github_pr_is_check(self, wiki_harvest: Any) -> None:
        url = "https://github.com/foo/bar/pull/99"
        assert wiki_harvest.classify_url(url) == "check"

    def test_stackoverflow_is_check(self, wiki_harvest: Any) -> None:
        url = "https://stackoverflow.com/questions/12345/title"
        assert wiki_harvest.classify_url(url) == "check"

    def test_localhost_skip(self, wiki_harvest: Any) -> None:
        for url in ("http://localhost:3000/path", "http://localhost/foo"):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_private_ip_skip(self, wiki_harvest: Any) -> None:
        for url in (
            "http://10.0.0.1/api",
            "http://172.30.224.1:8080/v1",
            "http://192.168.1.1/test",
            "http://127.0.0.1:8080/foo",
        ):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_local_and_internal_tld_skip(self, wiki_harvest: Any) -> None:
        # `.local` and `.internal` are baked into SKIP_DOMAIN_PATTERNS.
        for url in ("https://router.local/admin", "https://service.internal/api"):
            assert wiki_harvest.classify_url(url) == "skip"

    def test_custom_skip_pattern_runtime(self, wiki_harvest: Any) -> None:
        # Users can append their own patterns at runtime — verify the hook works.
        wiki_harvest.SKIP_DOMAIN_PATTERNS.append(r"\.mycompany\.com$")
        try:
            for url in (
                "https://git.mycompany.com/foo",
                "https://docs.mycompany.com/api",
            ):
                assert wiki_harvest.classify_url(url) == "skip"
        finally:
            # Undo the mutation so later tests see pristine module state.
            wiki_harvest.SKIP_DOMAIN_PATTERNS.pop()

    def test_atlassian_skip(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.classify_url("https://foo.atlassian.net/browse/BAR-1") == "skip"

    def test_slack_skip(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.classify_url("https://myteam.slack.com/archives/C123") == "skip"

    def test_github_repo_root_is_harvest(self, wiki_harvest: Any) -> None:
        # Not an issue/pr/discussion — just a repo root, might contain docs.
        assert wiki_harvest.classify_url("https://github.com/foo/bar") == "harvest"

    def test_invalid_url_skip(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.classify_url("not a url") == "skip"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Private IP detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPrivateIp:
    """_is_private_ip(): RFC 1918 ranges, loopback, and non-IP hostnames."""

    def test_10_range(self, wiki_harvest: Any) -> None:
        for addr in ("10.0.0.1", "10.255.255.255"):
            assert wiki_harvest._is_private_ip(addr) is True

    def test_172_16_to_31_range(self, wiki_harvest: Any) -> None:
        # Only 172.16.0.0 – 172.31.255.255 is private; both neighbours are public.
        assert wiki_harvest._is_private_ip("172.16.0.1") is True
        assert wiki_harvest._is_private_ip("172.31.255.255") is True
        assert wiki_harvest._is_private_ip("172.15.0.1") is False
        assert wiki_harvest._is_private_ip("172.32.0.1") is False

    def test_192_168_range(self, wiki_harvest: Any) -> None:
        assert wiki_harvest._is_private_ip("192.168.0.1") is True
        assert wiki_harvest._is_private_ip("192.167.0.1") is False

    def test_loopback(self, wiki_harvest: Any) -> None:
        assert wiki_harvest._is_private_ip("127.0.0.1") is True

    def test_public_ip(self, wiki_harvest: Any) -> None:
        assert wiki_harvest._is_private_ip("8.8.8.8") is False

    def test_hostname_not_ip(self, wiki_harvest: Any) -> None:
        # A bare hostname must not be mistaken for a private address.
        assert wiki_harvest._is_private_ip("example.com") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL extraction from files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractUrls:
    """extract_urls_from_file(): discovery, filtering, and de-duplication."""

    def test_finds_urls_in_markdown(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        body = (
            "See https://docs.python.org/3/library/os.html for details.\n"
            "Also https://fastapi.tiangolo.com/tutorial/.\n"
        )
        path = make_conversation(tmp_wiki, "test", "test.md", body=body)
        found = wiki_harvest.extract_urls_from_file(path)
        assert "https://docs.python.org/3/library/os.html" in found
        assert "https://fastapi.tiangolo.com/tutorial/" in found

    def test_filters_asset_extensions(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        body = (
            "Real: https://example.com/docs/article.html\n"
            "Image: https://example.com/logo.png\n"
            "Script: https://cdn.example.com/lib.js\n"
            "Font: https://fonts.example.com/face.woff2\n"
        )
        path = make_conversation(tmp_wiki, "test", "assets.md", body=body)
        found = wiki_harvest.extract_urls_from_file(path)
        # The article survives; static assets are dropped by extension.
        assert "https://example.com/docs/article.html" in found
        for ext in (".png", ".js", ".woff2"):
            assert not any(u.endswith(ext) for u in found)

    def test_strips_trailing_punctuation(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        body = "See https://example.com/foo. Also https://example.com/bar, and more.\n"
        path = make_conversation(tmp_wiki, "test", "punct.md", body=body)
        found = wiki_harvest.extract_urls_from_file(path)
        assert "https://example.com/foo" in found
        assert "https://example.com/bar" in found

    def test_deduplicates_within_file(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        body = (
            "First mention: https://example.com/same\n"
            "Second mention: https://example.com/same\n"
        )
        path = make_conversation(tmp_wiki, "test", "dup.md", body=body)
        found = wiki_harvest.extract_urls_from_file(path)
        assert found.count("https://example.com/same") == 1

    def test_returns_empty_for_missing_file(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        assert wiki_harvest.extract_urls_from_file(tmp_wiki / "nope.md") == []

    def test_filters_short_urls(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        # URLs shorter than 20 characters are skipped.
        body = "tiny http://a.b/ and https://example.com/long-path\n"
        path = make_conversation(tmp_wiki, "test", "short.md", body=body)
        found = wiki_harvest.extract_urls_from_file(path)
        assert "http://a.b/" not in found
        assert "https://example.com/long-path" in found
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Raw filename derivation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRawFilename:
    """raw_filename_for_url(): slug derivation for harvested raw files."""

    def test_basic_url(self, wiki_harvest: Any) -> None:
        name = wiki_harvest.raw_filename_for_url("https://docs.docker.com/build/multi-stage/")
        assert name.startswith("docs-docker-com-")
        assert "build" in name
        assert "multi-stage" in name
        assert name.endswith(".md")

    def test_strips_www(self, wiki_harvest: Any) -> None:
        assert "www" not in wiki_harvest.raw_filename_for_url("https://www.example.com/foo")

    def test_root_url_uses_index(self, wiki_harvest: Any) -> None:
        # A bare domain with no path falls back to an "index" slug.
        assert wiki_harvest.raw_filename_for_url("https://example.com/") == "example-com-index.md"

    def test_long_paths_truncated(self, wiki_harvest: Any) -> None:
        url = "https://example.com/" + "a-very-long-segment/" * 20
        assert len(wiki_harvest.raw_filename_for_url(url)) < 200
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Content validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestValidateContent:
    """validate_content(): accept clean markdown, reject short/HTML-leaking text."""

    def test_accepts_clean_markdown(self, wiki_harvest: Any) -> None:
        body = "# Title\n\n" + "A clean paragraph of markdown content. " * 5
        assert wiki_harvest.validate_content(body) is True

    def test_rejects_empty(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.validate_content("") is False

    def test_rejects_too_short(self, wiki_harvest: Any) -> None:
        assert wiki_harvest.validate_content("# Short") is False

    def test_rejects_html_leak(self, wiki_harvest: Any) -> None:
        # Raw HTML surviving extraction signals a bad conversion — reject it.
        body = "# Title\n\n<div class='nav'>Navigation</div>\n" + "content " * 30
        assert wiki_harvest.validate_content(body) is False

    def test_rejects_script_tag(self, wiki_harvest: Any) -> None:
        body = "# Title\n\n<script>alert()</script>\n" + "content " * 30
        assert wiki_harvest.validate_content(body) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# State management
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestStateManagement:
    """load_state()/save_state(): round-trip through .harvest-state.json."""

    def test_load_returns_defaults_when_file_empty(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        # An empty JSON object on disk must still yield the default keys.
        (tmp_wiki / ".harvest-state.json").write_text("{}")
        state = wiki_harvest.load_state()
        assert "harvested_urls" in state
        assert "skipped_urls" in state

    def test_save_and_reload(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        entry = {
            "first_seen": "2026-04-12",
            "seen_in": ["conversations/mc/foo.md"],
            "raw_file": "raw/harvested/example.md",
            "status": "raw",
            "fetch_method": "trafilatura",
        }
        state = wiki_harvest.load_state()
        state["harvested_urls"]["https://example.com"] = entry
        wiki_harvest.save_state(state)

        reloaded = wiki_harvest.load_state()
        assert "https://example.com" in reloaded["harvested_urls"]
        # last_run should be populated after a save.
        assert reloaded["last_run"] is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Raw file writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestWriteRawFile:
    """write_raw_file(): raw file creation with provenance frontmatter."""

    def test_writes_with_frontmatter(self, wiki_harvest: Any, tmp_wiki: Path) -> None:
        source = make_conversation(tmp_wiki, "test", "source.md")
        raw_path = wiki_harvest.write_raw_file(
            "https://example.com/article",
            "# Article\n\nClean content.\n",
            "trafilatura",
            source,
        )
        assert raw_path.exists()
        text = raw_path.read_text()
        # Provenance fields the frontmatter must carry.
        for needle in (
            "source_url: https://example.com/article",
            "fetch_method: trafilatura",
            "content_hash: sha256:",
            "discovered_in: conversations/test/source.md",
        ):
            assert needle in text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dry-run CLI smoke test (no actual fetches)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHarvestCli:
    """CLI smoke tests — dry-run and --help only, never a real fetch."""

    def test_dry_run_no_network_calls(self, run_script, tmp_wiki: Path) -> None:
        body = "See https://docs.python.org/3/ and https://github.com/foo/bar/issues/1.\n"
        make_conversation(tmp_wiki, "test", "test.md", body=body)
        result = run_script("wiki-harvest.py", "--dry-run")
        assert result.returncode == 0
        # Dry-run should classify without fetching.
        assert "would-harvest" in result.stdout or "Summary" in result.stdout

    def test_help_flag(self, run_script) -> None:
        result = run_script("wiki-harvest.py", "--help")
        assert result.returncode == 0
        assert "--dry-run" in result.stdout
        assert "--no-compile" in result.stdout
|
||||
Reference in New Issue
Block a user