Files
PDFToMD/tests/test_sample_metadata.py
T
김경종 7e985ae94a add files
2026-04-30 17:05:19 +09:00

88 lines
2.4 KiB
Python

import json
from pathlib import Path
import fitz
# Project root: the directory that contains this file's parent directory.
ROOT = Path(__file__).resolve().parents[1]

# Manifest validated by the tests in this module.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")

# Keys that each sample's "traits" mapping is required to contain.
REQUIRED_TRAITS = {
    # Content density traits.
    "figure_density",
    "formula_density",
    "table_density",
    # Scan / text-layer traits.
    "mixed_scanned_text_pages",
    "scanned_pages",
    "text_layer_quality",
    # Remaining traits.
    "has_korean_path",
    "layout_risk",
    "target_regression_focus",
}
def _load_metadata():
    """Read METADATA_PATH as UTF-8 text and return the parsed JSON value."""
    text = METADATA_PATH.read_text(encoding="utf-8")
    return json.loads(text)
def _metadata_samples():
    """Return the list stored under the manifest's "samples" key.

    Asserts that the parsed manifest is a dict and that its "samples"
    value is a list, so callers can iterate the result directly.
    """
    manifest = _load_metadata()
    assert isinstance(manifest, dict)

    entries = manifest.get("samples")
    assert isinstance(entries, list)

    return entries
def test_metadata_paths_match_current_sample_pdfs_exactly():
    """Check that the manifest lists exactly the PDFs found in samples/.

    The PDFs are discovered under ``ROOT / "samples"`` rather than a
    working-directory-relative ``samples`` folder, so the result does not
    depend on where pytest is invoked from. Each discovered file is
    expressed relative to ROOT in POSIX form (e.g. ``samples/name.pdf``)
    before being compared with the manifest's "path" values.
    """
    samples_dir = ROOT / "samples"
    expected_paths = sorted(
        pdf.relative_to(ROOT).as_posix()
        for pdf in samples_dir.glob("*.pdf")
    )

    metadata_paths = [sample.get("path") for sample in _metadata_samples()]

    assert sorted(metadata_paths) == expected_paths
def test_metadata_paths_are_unique():
    """Check that no "path" value occurs more than once in the manifest."""
    listed = []
    for entry in _metadata_samples():
        listed.append(entry.get("path"))

    distinct = set(listed)
    assert len(listed) == len(distinct)
def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Check the form of every manifest "path" value.

    Each value must be a string that starts with ``samples/``, ends in a
    ``.pdf`` suffix, is not absolute, and names an existing file when
    joined onto ROOT.
    """
    for entry in _metadata_samples():
        raw_path = entry.get("path")
        assert isinstance(raw_path, str)
        assert raw_path.startswith("samples/")

        candidate = Path(raw_path)
        assert candidate.suffix == ".pdf"
        assert not candidate.is_absolute()

        resolved = ROOT / raw_path
        assert resolved.is_file()
def test_required_trait_fields_are_present():
    """Check that every sample's "traits" dict has all REQUIRED_TRAITS keys.

    The sample's "path" value is used as the assertion message so a
    failure identifies the offending entry.
    """
    for entry in _metadata_samples():
        trait_map = entry.get("traits")
        assert isinstance(trait_map, dict), entry.get("path")

        has_all_required = REQUIRED_TRAITS.issubset(trait_map.keys())
        assert has_all_required, entry.get("path")
def test_page_counts_match_current_sample_pdfs():
    """Check each sample's "page_count" against the PDF opened with fitz.

    The recorded value must be a positive int equal to the opened
    document's ``page_count``. The sample's "path" is the failure message.
    """
    for entry in _metadata_samples():
        declared = entry.get("page_count")
        label = entry["path"]
        pdf_path = ROOT / label

        assert isinstance(declared, int), label
        assert declared > 0, label

        with fitz.open(pdf_path) as pdf:
            actual = pdf.page_count
            assert declared == actual, label
def test_metadata_json_is_deterministic_utf8():
    """Check that the manifest text equals its own canonical serialization.

    Canonical here means ``json.dumps`` with non-ASCII characters kept
    as-is, two-space indentation, sorted keys, and one trailing newline.
    """
    on_disk = METADATA_PATH.read_text(encoding="utf-8")
    canonical = json.dumps(
        json.loads(on_disk),
        ensure_ascii=False,
        indent=2,
        sort_keys=True,
    )
    assert on_disk == canonical + "\n"