"""Unit + integration tests for scripts/wiki-harvest.py.""" from __future__ import annotations import json from pathlib import Path from typing import Any from unittest.mock import patch import pytest from conftest import make_conversation # --------------------------------------------------------------------------- # URL classification # --------------------------------------------------------------------------- class TestClassifyUrl: def test_regular_docs_site_harvest(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("https://docs.python.org/3/library/os.html") == "harvest" assert wiki_harvest.classify_url("https://blog.example.com/post") == "harvest" def test_github_issue_is_check(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("https://github.com/foo/bar/issues/42") == "check" def test_github_pr_is_check(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("https://github.com/foo/bar/pull/99") == "check" def test_stackoverflow_is_check(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url( "https://stackoverflow.com/questions/12345/title" ) == "check" def test_localhost_skip(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("http://localhost:3000/path") == "skip" assert wiki_harvest.classify_url("http://localhost/foo") == "skip" def test_private_ip_skip(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("http://10.0.0.1/api") == "skip" assert wiki_harvest.classify_url("http://172.30.224.1:8080/v1") == "skip" assert wiki_harvest.classify_url("http://192.168.1.1/test") == "skip" assert wiki_harvest.classify_url("http://127.0.0.1:8080/foo") == "skip" def test_local_and_internal_tld_skip(self, wiki_harvest: Any) -> None: # `.local` and `.internal` are baked into SKIP_DOMAIN_PATTERNS assert wiki_harvest.classify_url("https://router.local/admin") == "skip" assert wiki_harvest.classify_url("https://service.internal/api") == "skip" def test_custom_skip_pattern_runtime(self, wiki_harvest: Any) -> None: # Users can append their own patterns at runtime — verify the hook works wiki_harvest.SKIP_DOMAIN_PATTERNS.append(r"\.mycompany\.com$") try: assert wiki_harvest.classify_url("https://git.mycompany.com/foo") == "skip" assert wiki_harvest.classify_url("https://docs.mycompany.com/api") == "skip" finally: wiki_harvest.SKIP_DOMAIN_PATTERNS.pop() def test_atlassian_skip(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("https://foo.atlassian.net/browse/BAR-1") == "skip" def test_slack_skip(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("https://myteam.slack.com/archives/C123") == "skip" def test_github_repo_root_is_harvest(self, wiki_harvest: Any) -> None: # Not an issue/pr/discussion — just a repo root, might contain docs assert wiki_harvest.classify_url("https://github.com/foo/bar") == "harvest" def test_invalid_url_skip(self, wiki_harvest: Any) -> None: assert wiki_harvest.classify_url("not a url") == "skip" # --------------------------------------------------------------------------- # Private IP detection # --------------------------------------------------------------------------- class TestPrivateIp: def test_10_range(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("10.0.0.1") is True assert wiki_harvest._is_private_ip("10.255.255.255") is True def test_172_16_to_31_range(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("172.16.0.1") is True assert wiki_harvest._is_private_ip("172.31.255.255") is True assert wiki_harvest._is_private_ip("172.15.0.1") is False assert wiki_harvest._is_private_ip("172.32.0.1") is False def test_192_168_range(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("192.168.0.1") is True assert wiki_harvest._is_private_ip("192.167.0.1") is False def test_loopback(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("127.0.0.1") is True def test_public_ip(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("8.8.8.8") is False def test_hostname_not_ip(self, wiki_harvest: Any) -> None: assert wiki_harvest._is_private_ip("example.com") is False # --------------------------------------------------------------------------- # URL extraction from files # --------------------------------------------------------------------------- class TestExtractUrls: def test_finds_urls_in_markdown( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: path = make_conversation( tmp_wiki, "test", "test.md", body="See https://docs.python.org/3/library/os.html for details.\n" "Also https://fastapi.tiangolo.com/tutorial/.\n", ) urls = wiki_harvest.extract_urls_from_file(path) assert "https://docs.python.org/3/library/os.html" in urls assert "https://fastapi.tiangolo.com/tutorial/" in urls def test_filters_asset_extensions( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: path = make_conversation( tmp_wiki, "test", "assets.md", body=( "Real: https://example.com/docs/article.html\n" "Image: https://example.com/logo.png\n" "Script: https://cdn.example.com/lib.js\n" "Font: https://fonts.example.com/face.woff2\n" ), ) urls = wiki_harvest.extract_urls_from_file(path) assert "https://example.com/docs/article.html" in urls assert not any(u.endswith(".png") for u in urls) assert not any(u.endswith(".js") for u in urls) assert not any(u.endswith(".woff2") for u in urls) def test_strips_trailing_punctuation( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: path = make_conversation( tmp_wiki, "test", "punct.md", body="See https://example.com/foo. Also https://example.com/bar, and more.\n", ) urls = wiki_harvest.extract_urls_from_file(path) assert "https://example.com/foo" in urls assert "https://example.com/bar" in urls def test_deduplicates_within_file( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: path = make_conversation( tmp_wiki, "test", "dup.md", body=( "First mention: https://example.com/same\n" "Second mention: https://example.com/same\n" ), ) urls = wiki_harvest.extract_urls_from_file(path) assert urls.count("https://example.com/same") == 1 def test_returns_empty_for_missing_file( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: assert wiki_harvest.extract_urls_from_file(tmp_wiki / "nope.md") == [] def test_filters_short_urls( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: # Less than 20 chars are skipped path = make_conversation( tmp_wiki, "test", "short.md", body="tiny http://a.b/ and https://example.com/long-path\n", ) urls = wiki_harvest.extract_urls_from_file(path) assert "http://a.b/" not in urls assert "https://example.com/long-path" in urls # --------------------------------------------------------------------------- # Raw filename derivation # --------------------------------------------------------------------------- class TestRawFilename: def test_basic_url(self, wiki_harvest: Any) -> None: name = wiki_harvest.raw_filename_for_url("https://docs.docker.com/build/multi-stage/") assert name.startswith("docs-docker-com-") assert "build" in name and "multi-stage" in name assert name.endswith(".md") def test_strips_www(self, wiki_harvest: Any) -> None: name = wiki_harvest.raw_filename_for_url("https://www.example.com/foo") assert "www" not in name def test_root_url_uses_index(self, wiki_harvest: Any) -> None: name = wiki_harvest.raw_filename_for_url("https://example.com/") assert name == "example-com-index.md" def test_long_paths_truncated(self, wiki_harvest: Any) -> None: long_url = "https://example.com/" + "a-very-long-segment/" * 20 name = wiki_harvest.raw_filename_for_url(long_url) assert len(name) < 200 # --------------------------------------------------------------------------- # Content validation # --------------------------------------------------------------------------- class TestValidateContent: def test_accepts_clean_markdown(self, wiki_harvest: Any) -> None: content = "# Title\n\n" + ("A clean paragraph of markdown content. " * 5) assert wiki_harvest.validate_content(content) is True def test_rejects_empty(self, wiki_harvest: Any) -> None: assert wiki_harvest.validate_content("") is False def test_rejects_too_short(self, wiki_harvest: Any) -> None: assert wiki_harvest.validate_content("# Short") is False def test_rejects_html_leak(self, wiki_harvest: Any) -> None: content = "# Title\n\n\n" + "content " * 30 assert wiki_harvest.validate_content(content) is False def test_rejects_script_tag(self, wiki_harvest: Any) -> None: content = "# Title\n\n\n" + "content " * 30 assert wiki_harvest.validate_content(content) is False # --------------------------------------------------------------------------- # State management # --------------------------------------------------------------------------- class TestStateManagement: def test_load_returns_defaults_when_file_empty( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: (tmp_wiki / ".harvest-state.json").write_text("{}") state = wiki_harvest.load_state() assert "harvested_urls" in state assert "skipped_urls" in state def test_save_and_reload( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: state = wiki_harvest.load_state() state["harvested_urls"]["https://example.com"] = { "first_seen": "2026-04-12", "seen_in": ["conversations/mc/foo.md"], "raw_file": "raw/harvested/example.md", "status": "raw", "fetch_method": "trafilatura", } wiki_harvest.save_state(state) reloaded = wiki_harvest.load_state() assert "https://example.com" in reloaded["harvested_urls"] assert reloaded["last_run"] is not None # --------------------------------------------------------------------------- # Raw file writer # --------------------------------------------------------------------------- class TestWriteRawFile: def test_writes_with_frontmatter( self, wiki_harvest: Any, tmp_wiki: Path ) -> None: conv = make_conversation(tmp_wiki, "test", "source.md") raw_path = wiki_harvest.write_raw_file( "https://example.com/article", "# Article\n\nClean content.\n", "trafilatura", conv, ) assert raw_path.exists() text = raw_path.read_text() assert "source_url: https://example.com/article" in text assert "fetch_method: trafilatura" in text assert "content_hash: sha256:" in text assert "discovered_in: conversations/test/source.md" in text # --------------------------------------------------------------------------- # Dry-run CLI smoke test (no actual fetches) # --------------------------------------------------------------------------- class TestHarvestCli: def test_dry_run_no_network_calls( self, run_script, tmp_wiki: Path ) -> None: make_conversation( tmp_wiki, "test", "test.md", body="See https://docs.python.org/3/ and https://github.com/foo/bar/issues/1.\n", ) result = run_script("wiki-harvest.py", "--dry-run") assert result.returncode == 0 # Dry-run should classify without fetching assert "would-harvest" in result.stdout or "Summary" in result.stdout def test_help_flag(self, run_script) -> None: result = run_script("wiki-harvest.py", "--help") assert result.returncode == 0 assert "--dry-run" in result.stdout assert "--no-compile" in result.stdout