# PDFToMD/tests/test_preanalysis.py

from __future__ import annotations

import json
import shutil
from pathlib import Path

from pdftomd.models import PageAnalysis, PageRange
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks


# Directory two levels above this file (the parent of the directory that
# contains this test module); sample paths are resolved relative to it.
ROOT = Path(__file__).resolve().parents[1]
# JSON fixture read by the helpers below; it is expected to hold a top-level
# "samples" list.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")


def _metadata_samples() -> list[dict]:
    """Return the ``"samples"`` list parsed from the metadata JSON file.

    The file at ``METADATA_PATH`` is decoded as UTF-8 and re-read on every
    call; nothing is cached.
    """
    with METADATA_PATH.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload["samples"]


def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits match every keyword.

    Each keyword argument is compared for equality against the same key in
    the sample's ``"traits"`` mapping; a key absent from the mapping compares
    as ``None``.

    Raises:
        AssertionError: if no sample satisfies all requested traits.
    """

    def _matches(candidate: dict) -> bool:
        candidate_traits = candidate["traits"]
        for key, wanted in traits.items():
            if candidate_traits.get(key) != wanted:
                return False
        return True

    found = next(
        (candidate for candidate in _metadata_samples() if _matches(candidate)),
        None,
    )
    if found is None:
        raise AssertionError(f"no sample matched traits: {traits}")
    return found


def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Analyze a sample tagged with a good text layer and no mixed scanned pages.

    Checks that the reported page count matches the metadata, that every page
    entry is a ``PageAnalysis``, and that the first page has a substantial
    text layer and is not flagged for OCR.
    """
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)

    analysis = analyze_pdf(ROOT / sample["path"])

    # Document-level counts must agree with the metadata fixture.
    assert analysis.page_count == sample["page_count"]
    assert len(analysis.pages) == sample["page_count"]
    for entry in analysis.pages:
        assert isinstance(entry, PageAnalysis)

    # Page-level facts for the first page (page numbers start at 1).
    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr


def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Analyze a sample tagged as mixing scanned and text pages.

    Every page number listed under the sample's ``scanned_pages`` trait must
    be among the pages flagged ``needs_ocr``, and at least one page must
    report an image.
    """
    sample = _sample_with(mixed_scanned_text_pages=True)

    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_for_ocr = set()
    for entry in analysis.pages:
        if entry.needs_ocr:
            flagged_for_ocr.add(entry.page)

    assert analysis.page_count == sample["page_count"]

    # The analysis may flag extra pages; it must not miss any listed one.
    listed_scanned = set(sample["traits"]["scanned_pages"])
    assert listed_scanned.issubset(flagged_for_ocr)

    found_image = False
    for entry in analysis.pages:
        if entry.image_count > 0:
            found_image = True
            break
    assert found_image


def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Analyze a PDF copied into a directory with a Korean (non-ASCII) name.

    The sample is copied under ``tmp_path`` so the ``Path`` handed to
    ``analyze_pdf`` contains Hangul characters and a space.
    """
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original_pdf = ROOT / sample["path"]

    # Build the non-ASCII directory and place a copy of the sample inside it.
    korean_dir = tmp_path.joinpath("한글 경로")
    korean_dir.mkdir()
    copied_pdf = korean_dir.joinpath(original_pdf.name)
    shutil.copyfile(original_pdf, copied_pdf)

    analysis = analyze_pdf(copied_pdf)

    assert analysis.page_count == sample["page_count"]
    first_page = analysis.pages[0]
    assert first_page.has_text_layer


def test_ocr_candidate_logic_is_deterministic() -> None:
    """Check ``is_ocr_candidate`` against a fixed table, evaluated twice.

    The table includes rows on either side of the 199/200 text-length
    boundary; both passes must produce the expected verdicts.
    """
    # Each row is (text_length, image_count, expected_verdict).
    cases = [
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    ]
    inputs = [(text_length, image_count) for text_length, image_count, _ in cases]
    expected = [verdict for _, _, verdict in cases]

    first_pass = [is_ocr_candidate(*args) for args in inputs]
    second_pass = [is_ocr_candidate(*args) for args in inputs]

    assert first_pass == expected
    # Repeating the same calls must not change the answers.
    assert first_pass == second_pass


def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Check that ``plan_page_chunks`` splits 76 pages into 20-page ranges.

    The final range is expected to stop at page 76 instead of running to 80,
    and a zero-page document must yield an empty tuple.
    """
    total_pages = 76
    expected = (
        PageRange(1, 20),
        PageRange(21, 40),
        PageRange(41, 60),
        PageRange(61, 76),
    )

    chunks = plan_page_chunks(total_pages)

    assert chunks == expected
    # No range may extend beyond the last page of the document.
    for chunk in chunks:
        assert chunk.end <= total_pages
    assert plan_page_chunks(0) == ()