"""Consistency tests for ``samples/metadata.json``.

Each test compares the metadata file with the PDF files stored in the
``samples`` directory located under ``ROOT``.
"""

import json
from pathlib import Path

import fitz

# Directory one level above the one containing this file (presumably the
# repository root); every sample path is resolved against it.
ROOT = Path(__file__).resolve().parents[1]
SAMPLES_DIR = ROOT / "samples"
METADATA_PATH = SAMPLES_DIR / "metadata.json"

# Keys that every sample's "traits" mapping must contain.
REQUIRED_TRAITS = {
    "figure_density",
    "formula_density",
    "has_korean_path",
    "layout_risk",
    "mixed_scanned_text_pages",
    "scanned_pages",
    "table_density",
    "target_regression_focus",
    "text_layer_quality",
}


def _load_metadata():
    """Return the parsed JSON content of ``METADATA_PATH``."""
    with METADATA_PATH.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _metadata_samples():
    """Return the ``samples`` list from the metadata file.

    Asserts that the top-level value is a dict and that ``samples`` is a list.
    """
    metadata = _load_metadata()
    assert isinstance(metadata, dict)
    samples = metadata.get("samples")
    assert isinstance(samples, list)
    return samples


def test_metadata_paths_match_current_sample_pdfs_exactly():
    """The metadata lists exactly the ``*.pdf`` files found in ``samples``."""
    # Glob relative to ROOT (not the current working directory) so the result
    # does not depend on where pytest is invoked from; the paths are then made
    # ROOT-relative again to match the "samples/<name>.pdf" form in the metadata.
    expected_paths = sorted(
        path.relative_to(ROOT).as_posix() for path in SAMPLES_DIR.glob("*.pdf")
    )
    metadata_paths = [sample.get("path") for sample in _metadata_samples()]
    # Check the types first: sorting a list containing None or mixed types
    # would raise TypeError instead of producing a readable assertion failure.
    assert all(isinstance(path, str) for path in metadata_paths), metadata_paths
    assert sorted(metadata_paths) == expected_paths


def test_metadata_paths_are_unique():
    """No two samples share the same ``path`` value."""
    metadata_paths = [sample.get("path") for sample in _metadata_samples()]
    assert len(metadata_paths) == len(set(metadata_paths))


def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Every path is a relative ``samples/...pdf`` string naming a real file."""
    for sample in _metadata_samples():
        path = sample.get("path")
        assert isinstance(path, str)
        assert path.startswith("samples/")
        assert Path(path).suffix == ".pdf"
        assert not Path(path).is_absolute()
        assert (ROOT / path).is_file()


def test_required_trait_fields_are_present():
    """Every sample has a ``traits`` dict containing all required keys."""
    for sample in _metadata_samples():
        traits = sample.get("traits")
        assert isinstance(traits, dict), sample.get("path")
        assert REQUIRED_TRAITS <= traits.keys(), sample.get("path")


def test_page_counts_match_current_sample_pdfs():
    """Each recorded ``page_count`` equals the page count of the PDF itself."""
    for sample in _metadata_samples():
        page_count = sample.get("page_count")
        path = ROOT / sample["path"]
        # bool is a subclass of int, so a JSON ``true`` would otherwise pass
        # the type check and compare equal to a one-page document.
        assert isinstance(page_count, int) and not isinstance(
            page_count, bool
        ), sample["path"]
        assert page_count > 0, sample["path"]
        with fitz.open(path) as document:
            assert page_count == document.page_count, sample["path"]


def test_metadata_json_is_deterministic_utf8():
    """The file text equals its own canonical re-serialisation.

    Canonical form: non-ASCII characters kept as-is, two-space indentation,
    sorted keys, and a single trailing newline.
    """
    raw = METADATA_PATH.read_text(encoding="utf-8")
    metadata = json.loads(raw)
    canonical = json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True)
    assert raw == canonical + "\n"