add files

This commit is contained in:
김경종
2026-04-30 17:05:19 +09:00
parent f3e01b5a8c
commit 7e985ae94a
135 changed files with 41205 additions and 0 deletions
+93
View File
@@ -0,0 +1,93 @@
from __future__ import annotations
import json
import shutil
from pathlib import Path
from pdftomd.models import PageAnalysis, PageRange
from pdftomd.preanalysis import analyze_pdf, is_ocr_candidate, plan_page_chunks
# Directory one level above the folder that contains this file; the relative
# sample paths recorded in the metadata are joined onto it.
ROOT = Path(__file__).resolve().parents[1]
# JSON index whose "samples" list describes the sample documents used below.
METADATA_PATH = ROOT / "samples" / "metadata.json"
def _metadata_samples() -> list[dict]:
    """Load the sample descriptors from the metadata file.

    Returns:
        The value stored under the top-level ``"samples"`` key of the JSON
        document at ``METADATA_PATH`` (decoded as UTF-8).
    """
    with METADATA_PATH.open(encoding="utf-8") as handle:
        document = json.load(handle)
    return document["samples"]
def _sample_with(**traits: object) -> dict:
    """Return the first metadata sample whose traits match every keyword.

    Args:
        **traits: Trait names mapped to the value each must equal in the
            sample's ``"traits"`` mapping. A trait missing from the sample
            is compared as ``None``.

    Returns:
        The first sample, in metadata order, that satisfies all requested
        traits.

    Raises:
        AssertionError: If no sample satisfies all requested traits.
    """

    def _satisfies(candidate: dict) -> bool:
        # A sample qualifies only when every requested trait compares equal.
        declared = candidate["traits"]
        for name, wanted in traits.items():
            if declared.get(name) == wanted:
                continue
            return False
        return True

    # filter() is lazy, so scanning stops at the first matching sample.
    found = next(filter(_satisfies, _metadata_samples()), None)
    if found is None:
        raise AssertionError(f"no sample matched traits: {traits}")
    return found
def test_analyze_text_heavy_sample_returns_page_facts_from_metadata() -> None:
    """Analyze a good-text-layer, non-mixed sample and check its page facts.

    The page count must agree with the metadata, every page entry must be a
    ``PageAnalysis``, and the first page must report a text layer with more
    than 1000 characters and no need for OCR.
    """
    sample = _sample_with(text_layer_quality="good", mixed_scanned_text_pages=False)
    analysis = analyze_pdf(ROOT / sample["path"])
    expected_pages = sample["page_count"]

    assert analysis.page_count == expected_pages
    assert len(analysis.pages) == expected_pages
    for entry in analysis.pages:
        assert isinstance(entry, PageAnalysis)

    first_page = analysis.pages[0]
    assert first_page.page == 1
    assert first_page.text_length > 1000
    assert first_page.has_text_layer
    assert not first_page.needs_ocr
def test_analyze_mixed_scanned_risk_sample_marks_metadata_scanned_pages() -> None:
    """Analyze a mixed scanned/text sample and check the OCR flags.

    Every page number listed under the sample's ``scanned_pages`` trait must
    be among the pages flagged ``needs_ocr``, and at least one page must
    report a positive image count.
    """
    sample = _sample_with(mixed_scanned_text_pages=True)
    analysis = analyze_pdf(ROOT / sample["path"])
    flagged_for_ocr = {entry.page for entry in analysis.pages if entry.needs_ocr}

    assert analysis.page_count == sample["page_count"]

    declared_scanned = set(sample["traits"]["scanned_pages"])
    # Extra OCR-flagged pages are allowed; the declared ones must all be present.
    assert declared_scanned.issubset(flagged_for_ocr)

    image_counts = (entry.image_count for entry in analysis.pages)
    assert any(count > 0 for count in image_counts)
def test_analyze_pdf_accepts_korean_pathlib_path(tmp_path: Path) -> None:
    """Analyze a sample copied into a directory with a Korean name.

    Args:
        tmp_path: Temporary directory in which the Korean-named folder is
            created.
    """
    sample = _sample_with(has_korean_path=True, text_layer_quality="good")
    original_pdf = ROOT / sample["path"]

    # Copy the sample under a non-ASCII directory name before analyzing it.
    korean_dir = tmp_path.joinpath("한글 경로")
    korean_dir.mkdir()
    copied_pdf = korean_dir.joinpath(original_pdf.name)
    shutil.copyfile(original_pdf, copied_pdf)

    analysis = analyze_pdf(copied_pdf)

    assert analysis.page_count == sample["page_count"]
    first_page = analysis.pages[0]
    assert first_page.has_text_layer
def test_ocr_candidate_logic_is_deterministic() -> None:
    """Check ``is_ocr_candidate`` against a fixed table, evaluated twice.

    Each row is ``(text_length, image_count, expected)``. The first pass must
    match the expected flags and the second pass must equal the first.
    """
    table = (
        (0, 0, True),
        (0, 2, True),
        (40, 0, False),
        (199, 1, True),
        (200, 1, False),
        (1000, 8, False),
    )
    inputs = [(length, images) for length, images, _ in table]
    wanted = [flag for _, _, flag in table]

    def _evaluate() -> list:
        # One full pass over the table inputs, in table order.
        return [is_ocr_candidate(length, images) for length, images in inputs]

    first_pass = _evaluate()
    second_pass = _evaluate()

    assert first_pass == wanted
    assert first_pass == second_pass
def test_chunk_candidates_are_twenty_page_ranges_within_bounds() -> None:
    """Check the page ranges ``plan_page_chunks`` returns for 76 and 0 pages.

    For 76 pages the result must be the four ranges 1-20, 21-40, 41-60 and
    61-76, none ending past page 76. For 0 pages it must be an empty tuple.
    """
    total_pages = 76
    expected_bounds = [(1, 20), (21, 40), (41, 60), (61, 76)]
    expected = tuple(PageRange(first, last) for first, last in expected_bounds)

    chunks = plan_page_chunks(total_pages)

    assert chunks == expected
    for chunk in chunks:
        assert chunk.end <= total_pages
    assert plan_page_chunks(0) == ()