add files

2026-04-30 17:05:19 +09:00
parent f3e01b5a8c
commit 7e985ae94a
135 changed files with 41205 additions and 0 deletions
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import json
+import shutil
+from pathlib import Path
+
+from pdftomd.models import PageAnalysis, PageRange
+from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
+
+
+ROOT = Path(__file__).resolve().parents[1]
+METADATA_PATH = ROOT / "samples" / "metadata.json"
+
+
+def _metadata_samples() -> list[dict]:
+    return json.loads(METADATA_PATH.read_text(encoding="utf-8"))["samples"]
+
+
+def _sample_with(**traits: object) -> dict:
+    for sample in _metadata_samples():
+        sample_traits = sample["traits"]
+        if all(sample_traits.get(key) == value for key, value in traits.items()):
+            return sample
+    raise AssertionError(f"no sample matched traits: {traits}")
+
+
+def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
+    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)
+
+    result = analyze_pdf(ROOT / sample["path"])
+
+    assert result.page_count == sample["page_count"]
+    assert len(result.pages) == sample["page_count"]
+    assert all(isinstance(page, PageAnalysis) for page in result.pages)
+    assert result.pages[0].page == 1
+    assert result.pages[0].text_length > 1000
+    assert result.pages[0].has_text_layer
+    assert not result.pages[0].needs_ocr
+
+
+def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
+    sample = _sample_with(mixed_scanned_text_pages=True)
+
+    result = analyze_pdf(ROOT / sample["path"])
+    ocr_pages = {page.page for page in result.pages if page.needs_ocr}
+
+    assert result.page_count == sample["page_count"]
+    assert set(sample["traits"]["scanned_pages"]) <= ocr_pages
+    assert any(page.image_count > 0 for page in result.pages)
+
+
+def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
+    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
+    source = ROOT / sample["path"]
+    target_dir = tmp_path / "한글 경로"
+    target_dir.mkdir()
+    target = target_dir / source.name
+    shutil.copyfile(source, target)
+
+    result = analyze_pdf(target)
+
+    assert result.page_count == sample["page_count"]
+    assert result.pages[0].has_text_layer
+
+
+def test_ocr_candidate_logic_is_deterministic() -> None:
+    cases = [
+        (0, 0, True),
+        (0, 2, True),
+        (40, 0, False),
+        (199, 1, True),
+        (200, 1, False),
+        (1000, 8, False),
+    ]
+
+    first = [is_ocr_candidate(text_length, image_count) for text_length, image_count, _ in cases]
+    second = [is_ocr_candidate(text_length, image_count) for text_length, image_count, _ in cases]
+
+    assert first == [expected for _, _, expected in cases]
+    assert first == second
+
+
+def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
+    chunks = plan_page_chunks(76)
+
+    assert chunks == (
+        PageRange(1, 20),
+        PageRange(21, 40),
+        PageRange(41, 60),
+        PageRange(61, 76),
+    )
+    assert all(chunk.end <= 76 for chunk in chunks)
+    assert plan_page_chunks(0) == ()