88 lines
2.4 KiB
Python
88 lines
2.4 KiB
Python
import json
|
|
from pathlib import Path
|
|
|
|
import fitz
|
|
|
|
|
|
# Directory one level above the folder containing this file
# (presumably the repository root -- confirm against the project layout).
ROOT = Path(__file__).resolve().parents[1]
# JSON manifest describing the sample PDFs; every test in this module reads it.
METADATA_PATH = ROOT / "samples" / "metadata.json"
|
|
|
|
# Keys that every sample's "traits" mapping must contain
# (enforced by test_required_trait_fields_are_present).
REQUIRED_TRAITS = {
    "figure_density",
    "formula_density",
    "has_korean_path",
    "layout_risk",
    "mixed_scanned_text_pages",
    "scanned_pages",
    "table_density",
    "target_regression_focus",
    "text_layer_quality",
}
|
|
|
|
|
|
def _load_metadata():
    """Read ``METADATA_PATH`` as UTF-8 text and return the parsed JSON value."""
    text = METADATA_PATH.read_text(encoding="utf-8")
    return json.loads(text)
|
|
|
|
|
|
def _metadata_samples():
    """Return the ``"samples"`` list from the metadata document.

    Asserts that the top-level JSON value is a dict and that its
    ``"samples"`` entry is a list before returning it.
    """
    document = _load_metadata()
    assert isinstance(document, dict)

    entries = document.get("samples")
    assert isinstance(entries, list)

    return entries
|
|
|
|
|
|
def test_metadata_paths_match_current_sample_pdfs_exactly():
    """Check that metadata lists exactly the PDFs found directly in ``samples/``.

    The directory is globbed under ``ROOT`` rather than the current working
    directory, so the outcome does not depend on where pytest is invoked
    from. Paths are compared as POSIX-style strings relative to ``ROOT``
    (for example ``samples/name.pdf``).
    """
    expected_paths = sorted(
        path.relative_to(ROOT).as_posix()
        for path in (ROOT / "samples").glob("*.pdf")
    )
    metadata_paths = [sample.get("path") for sample in _metadata_samples()]

    # sorted() raises TypeError on a str/None mix; fail with a readable
    # assertion instead when a sample has a missing or non-string path.
    assert all(isinstance(path, str) for path in metadata_paths), metadata_paths
    assert sorted(metadata_paths) == expected_paths
|
|
|
|
|
|
def test_metadata_paths_are_unique():
    """Check that no ``path`` value appears more than once in the samples."""
    listed = [entry.get("path") for entry in _metadata_samples()]
    distinct = set(listed)

    assert len(listed) == len(distinct)
|
|
|
|
|
|
def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Check each sample ``path`` is a relative ``samples/`` ``.pdf`` string.

    Each path must also resolve to an existing file under ``ROOT``.
    """
    for entry in _metadata_samples():
        raw = entry.get("path")

        assert isinstance(raw, str)
        assert raw.startswith("samples/")

        as_path = Path(raw)
        assert as_path.suffix == ".pdf"
        assert not as_path.is_absolute()
        assert (ROOT / raw).is_file()
|
|
|
|
|
|
def test_required_trait_fields_are_present():
    """Check every sample has a ``traits`` dict with all ``REQUIRED_TRAITS`` keys.

    The sample's ``path`` is used as the assertion message so a failure
    identifies the offending entry.
    """
    for entry in _metadata_samples():
        trait_map = entry.get("traits")

        assert isinstance(trait_map, dict), entry.get("path")
        assert REQUIRED_TRAITS.issubset(trait_map.keys()), entry.get("path")
|
|
|
|
|
|
def test_page_counts_match_current_sample_pdfs():
    """Check each sample's ``page_count`` against the PDF opened with ``fitz``.

    The declared value must be a positive int equal to the opened
    document's ``page_count``; the sample path is the assertion message.
    """
    for entry in _metadata_samples():
        declared = entry.get("page_count")
        pdf_file = ROOT / entry["path"]

        assert isinstance(declared, int), entry["path"]
        assert declared > 0, entry["path"]

        with fitz.open(pdf_file) as pdf:
            assert declared == pdf.page_count, entry["path"]
|
|
|
|
|
|
def test_metadata_json_is_deterministic_utf8():
    """Check the metadata file text equals its own canonical re-serialization.

    Canonical form: ``json.dumps`` with non-ASCII characters kept as-is,
    two-space indentation, sorted keys, and a single trailing newline.
    """
    on_disk = METADATA_PATH.read_text(encoding="utf-8")
    canonical = json.dumps(
        json.loads(on_disk),
        ensure_ascii=False,
        indent=2,
        sort_keys=True,
    )

    assert on_disk == canonical + "\n"
|