add files
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
# Parent of the directory that contains this file; sample paths in the
# metadata are resolved against it.
ROOT = Path(__file__).resolve().parents[1]

# JSON document describing the sample PDFs checked by this module.
METADATA_PATH = ROOT.joinpath("samples", "metadata.json")

# Keys that every sample's "traits" mapping is required to contain.
REQUIRED_TRAITS = {
    "figure_density",
    "formula_density",
    "has_korean_path",
    "layout_risk",
    "mixed_scanned_text_pages",
    "scanned_pages",
    "table_density",
    "target_regression_focus",
    "text_layer_quality",
}
|
||||
|
||||
|
||||
def _load_metadata():
    """Read METADATA_PATH as UTF-8 text and return the decoded JSON value."""
    raw_text = METADATA_PATH.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def _metadata_samples():
    """Return the list stored under the "samples" key of the metadata.

    Asserts that the loaded document is a dict and that its "samples"
    entry is a list, so callers can iterate the result directly.
    """
    document = _load_metadata()
    assert isinstance(document, dict)

    entries = document.get("samples")
    assert isinstance(entries, list)

    return entries
|
||||
|
||||
|
||||
def test_metadata_paths_match_current_sample_pdfs_exactly():
    """Check that metadata lists exactly the PDFs present in ``samples/``.

    The expected paths are discovered under ``ROOT / "samples"`` rather than
    a directory relative to the current working directory, so the result
    does not depend on where pytest is invoked from. Paths are compared in
    POSIX form relative to ROOT (for example ``samples/name.pdf``).
    """
    samples_dir = ROOT / "samples"
    expected_paths = sorted(
        pdf.relative_to(ROOT).as_posix()
        for pdf in samples_dir.glob("*.pdf")
    )
    metadata_paths = [sample.get("path") for sample in _metadata_samples()]

    assert sorted(metadata_paths) == expected_paths
|
||||
|
||||
|
||||
def test_metadata_paths_are_unique():
    """Check that no two metadata entries share the same "path" value."""
    listed_paths = [entry.get("path") for entry in _metadata_samples()]
    distinct_paths = set(listed_paths)

    assert len(listed_paths) == len(distinct_paths)
|
||||
|
||||
|
||||
def test_metadata_paths_are_exact_relative_samples_pdf_paths():
    """Check the form of every metadata "path" value.

    Each path must be a string that starts with ``samples/``, ends in
    ``.pdf``, is not absolute, and names an existing file under ROOT.
    """
    for entry in _metadata_samples():
        raw_path = entry.get("path")

        assert isinstance(raw_path, str)
        assert raw_path.startswith("samples/")

        as_path = Path(raw_path)
        assert as_path.suffix == ".pdf"
        assert not as_path.is_absolute()
        assert (ROOT / raw_path).is_file()
|
||||
|
||||
|
||||
def test_required_trait_fields_are_present():
    """Check that each sample's "traits" dict has every required key.

    The sample's path is used as the assertion message so a failure
    identifies the offending entry.
    """
    for entry in _metadata_samples():
        trait_map = entry.get("traits")

        assert isinstance(trait_map, dict), entry.get("path")
        assert REQUIRED_TRAITS.issubset(trait_map.keys()), entry.get("path")
|
||||
|
||||
|
||||
def test_page_counts_match_current_sample_pdfs():
    """Check that each recorded page_count equals the PDF's real page count.

    For every sample the "page_count" value must be a positive integer and
    must equal the page count reported by opening ``ROOT / path`` with fitz.
    The sample's path is used as the assertion message.
    """
    for sample in _metadata_samples():
        page_count = sample.get("page_count")
        relative_path = sample["path"]

        # bool is a subclass of int, so a JSON ``true`` would otherwise pass
        # every check below for a one-page PDF.
        assert isinstance(page_count, int) and not isinstance(
            page_count, bool
        ), relative_path
        assert page_count > 0, relative_path

        with fitz.open(ROOT / relative_path) as document:
            assert page_count == document.page_count, relative_path
|
||||
|
||||
|
||||
def test_metadata_json_is_deterministic_utf8():
    """Check that the metadata file is stored in its canonical JSON form.

    The text on disk must equal its own re-serialisation with sorted keys,
    two-space indentation, non-ASCII characters kept literal, and a single
    trailing newline.
    """
    on_disk = METADATA_PATH.read_text(encoding="utf-8")
    canonical = json.dumps(
        json.loads(on_disk),
        ensure_ascii=False,
        indent=2,
        sort_keys=True,
    )

    assert on_disk == canonical + "\n"
|
||||
Reference in New Issue
Block a user