94 lines
3.0 KiB
Python
94 lines
3.0 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from pdftomd.models import PageAnalysis, PageRange
|
|
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
|
|
|
|
|
|
# Directory one level above the folder that contains this file
# (presumably the project root -- confirm against the repo layout).
ROOT = Path(__file__).resolve().parents[1]

# JSON document whose top-level "samples" list describes the sample PDFs.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")
|
|
|
|
|
|
def _metadata_samples() -> list[dict]:
    """Load the sample descriptors from the metadata JSON file.

    Returns:
        The value stored under the top-level ``"samples"`` key of the
        UTF-8 encoded JSON document at ``METADATA_PATH``.
    """
    raw_text = METADATA_PATH.read_text(encoding="utf-8")
    document = json.loads(raw_text)
    return document["samples"]
|
|
|
|
|
|
def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits match every keyword.

    Args:
        **traits: Trait names mapped to the value each must equal in the
            sample's ``"traits"`` mapping. A trait absent from the sample
            compares as ``None``.

    Returns:
        The first sample, in metadata order, that satisfies all traits.

    Raises:
        AssertionError: If no sample satisfies all requested traits.
    """

    def _matches(candidate: dict) -> bool:
        declared = candidate["traits"]
        return all(declared.get(name) == wanted for name, wanted in traits.items())

    # ``filter`` is lazy, so scanning stops at the first matching sample.
    found = next(filter(_matches, _metadata_samples()), None)
    if found is None:
        raise AssertionError(f"no sample matched traits: {traits}")
    return found
|
|
|
|
|
|
def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Check page facts for a sample with a good text layer and no mixed scans.

    The page count must agree with the metadata, every page entry must be a
    ``PageAnalysis``, and the first page must report a text layer with more
    than 1000 characters and no need for OCR.
    """
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)

    analysis = analyze_pdf(ROOT / sample["path"])

    assert analysis.page_count == sample["page_count"]
    assert len(analysis.pages) == sample["page_count"]
    for entry in analysis.pages:
        assert isinstance(entry, PageAnalysis)

    # Looked up only after the checks above, so an empty result fails on the
    # count assertions rather than on indexing.
    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr
|
|
|
|
|
|
def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Check that pages listed as scanned in the metadata are flagged for OCR.

    Extra OCR-flagged pages are allowed: the metadata's scanned pages only
    need to be a subset of the flagged ones. At least one page must also
    report an image.
    """
    sample = _sample_with(mixed_scanned_text_pages=True)

    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_pages = {entry.page for entry in analysis.pages if entry.needs_ocr}

    assert analysis.page_count == sample["page_count"]
    assert set(sample["traits"]["scanned_pages"]).issubset(flagged_pages)
    assert any(entry.image_count > 0 for entry in analysis.pages)
|
|
|
|
|
|
def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Check that ``analyze_pdf`` handles a ``Path`` containing Korean text.

    A sample PDF is copied into a temporary directory with a Korean name
    (including a space) and analysed from there.
    """
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original_pdf = ROOT / sample["path"]

    korean_dir = tmp_path / "한글 경로"
    korean_dir.mkdir()
    copied_pdf = korean_dir / original_pdf.name
    shutil.copyfile(original_pdf, copied_pdf)

    analysis = analyze_pdf(copied_pdf)

    assert analysis.page_count == sample["page_count"]
    assert analysis.pages[0].has_text_layer
|
|
|
|
|
|
def test_ocr_candidate_logic_is_deterministic() -> None:
    """Check ``is_ocr_candidate`` against a fixed table, evaluated twice.

    Each row is ``(text_length, image_count, expected)``. Both passes must
    produce the same answers, and those answers must match the table.
    """
    table = [
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    ]

    def _evaluate() -> list:
        return [is_ocr_candidate(length, images) for length, images, _ in table]

    first_pass = _evaluate()
    second_pass = _evaluate()
    expected = [outcome for _, _, outcome in table]

    assert first_pass == expected
    assert first_pass == second_pass
|
|
|
|
|
|
def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Check that a 76-page plan yields 20-page ranges with a short final one.

    No range may end past the last page, and a zero-page document must
    produce an empty tuple.
    """
    total_pages = 76
    planned = plan_page_chunks(total_pages)

    expected = (
        PageRange(1, 20),
        PageRange(21, 40),
        PageRange(41, 60),
        PageRange(61, 76),
    )
    assert planned == expected
    assert all(chunk.end <= total_pages for chunk in planned)
    assert plan_page_chunks(0) == ()
|