"""Tests for the PDF pre-analysis helpers imported from ``pdftomd.preanalysis``."""

from __future__ import annotations

import json
import shutil
from pathlib import Path

from pdftomd.models import PageAnalysis, PageRange
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks

# Parent of the directory containing this file; sample paths are joined onto it.
ROOT = Path(__file__).resolve().parents[1]
METADATA_PATH = ROOT / "samples" / "metadata.json"


def _metadata_samples() -> list[dict]:
    """Return the ``"samples"`` entry of the JSON document at ``METADATA_PATH``."""
    raw_text = METADATA_PATH.read_text(encoding="utf-8")
    document = json.loads(raw_text)
    return document["samples"]


def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits equal every given keyword.

    Args:
        **traits: Trait names mapped to the value the sample must declare.
            A trait the sample does not declare is compared as ``None``.

    Returns:
        The first sample, in file order, for which no requested trait differs.

    Raises:
        AssertionError: If no sample matches the requested traits.
    """
    for candidate in _metadata_samples():
        declared = candidate["traits"]
        mismatched = any(
            declared.get(name) != wanted for name, wanted in traits.items()
        )
        if not mismatched:
            return candidate
    raise AssertionError(f"no sample matched traits: {traits}")


def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Analyze a good-text-layer sample and check counts and first-page facts."""
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)
    analysis = analyze_pdf(ROOT / sample["path"])

    # Page totals must agree with what the metadata file records.
    assert analysis.page_count == sample["page_count"]
    assert len(analysis.pages) == sample["page_count"]
    for entry in analysis.pages:
        assert isinstance(entry, PageAnalysis)

    # Bound only after the checks above so an empty result fails on the counts.
    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr


def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Every page the metadata lists as scanned must be flagged ``needs_ocr``."""
    sample = _sample_with(mixed_scanned_text_pages=True)
    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_for_ocr = {entry.page for entry in analysis.pages if entry.needs_ocr}

    assert analysis.page_count == sample["page_count"]
    # Subset check: the analysis may flag more pages than the metadata lists.
    assert set(sample["traits"]["scanned_pages"]) <= flagged_for_ocr
    assert any(entry.image_count > 0 for entry in analysis.pages)


def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Analyze a copy of a sample placed under a directory with a Korean name."""
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original_pdf = ROOT / sample["path"]
    korean_dir = tmp_path / "한글 경로"
    korean_dir.mkdir()
    copied_pdf = korean_dir / original_pdf.name
    shutil.copyfile(original_pdf, copied_pdf)

    analysis = analyze_pdf(copied_pdf)

    assert analysis.page_count == sample["page_count"]
    assert analysis.pages[0].has_text_layer


def test_ocr_candidate_logic_is_deterministic() -> None:
    """Check ``is_ocr_candidate`` against a fixed table, evaluated twice."""
    # Each row is (text_length, image_count, expected result).
    cases = [
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    ]

    def evaluate_all() -> list:
        """Run every case through ``is_ocr_candidate`` in table order."""
        return [
            is_ocr_candidate(text_length, image_count)
            for text_length, image_count, _ in cases
        ]

    first = evaluate_all()
    second = evaluate_all()

    assert first == [row[2] for row in cases]
    # A second pass over the same inputs must give the same answers.
    assert first == second


def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Check the ranges planned for a 76-page document and for zero pages."""
    total_pages = 76
    chunks = plan_page_chunks(total_pages)
    expected = (
        PageRange(1, 20),
        PageRange(21, 40),
        PageRange(41, 60),
        PageRange(61, 76),
    )

    assert chunks == expected
    for chunk in chunks:
        assert chunk.end <= total_pages
    assert plan_page_chunks(0) == ()