from __future__ import annotations import json from pathlib import Path import pytest from pdf2md.ir import ( AssetRecord, BlockRecord, BlockType, DocumentRecord, PageRecord, WarningCode, WarningRecord, WarningSeverity, ) from pdf2md.metadata import MetadataInputError, build_metadata, build_summary def make_document(tmp_path: Path) -> DocumentRecord: page_zero = PageRecord( page_index=0, blocks=( BlockRecord(BlockType.HEADING, page_index=0), BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98), BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)), ), ) page_one = PageRecord( page_index=1, blocks=( BlockRecord(BlockType.PARAGRAPH, page_index=1), BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1), ), ) return DocumentRecord( source_pdf=tmp_path / "paper.pdf", pages=(page_zero, page_one), assets=(AssetRecord("paper.assets/figure.png", page_index=1),), warnings=( WarningRecord(WarningCode.READING_ORDER_UNCERTAIN, WarningSeverity.WARNING, "Check reading order.", page_index=1), WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.ERROR, "Math failed to render.", page_index=0), ), ) def build_test_metadata(tmp_path: Path) -> dict[str, object]: return build_metadata( document=make_document(tmp_path), source_sha256="0" * 64, created_at="2026-05-07T00:00:00Z", engine="MinerU", engine_version="3.1.0", engine_options={"strict_local": True}, ) def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None: metadata = build_test_metadata(tmp_path) assert set(metadata) == { "source_pdf", "source_sha256", "created_at", "engine", "engine_version", "engine_options", "pages", "assets", "warnings", "summary", } def test_metadata_summary_counts_from_records(tmp_path: Path) -> None: metadata = build_test_metadata(tmp_path) assert metadata["summary"] == { "pages_processed": 2, "warning_count": 2, "asset_count": 1, "display_formula_count": 2, "inline_formula_count": 1, "math_render_error_count": 1, } def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None: metadata = build_test_metadata(tmp_path) warnings = metadata["warnings"] assert [warning["code"] for warning in warnings] == [ "READING_ORDER_UNCERTAIN", "MATH_RENDER_FAILED", ] assert warnings[0]["page_index"] == 1 assert warnings[1]["page_index"] == 0 def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None: metadata = build_test_metadata(tmp_path) blocks = metadata["pages"][0]["blocks"] assert "confidence" not in blocks[0] assert blocks[1]["confidence"] == 0.98 assert "bbox" not in blocks[1] assert blocks[2]["bbox"] == [1.0, 2.0, 3.0, 4.0] def test_metadata_is_json_serializable(tmp_path: Path) -> None: json.dumps(build_test_metadata(tmp_path)) @pytest.mark.parametrize( ("field_name", "kwargs"), [ ("document", {"document": None}), ("source_sha256", {"source_sha256": ""}), ("created_at", {"created_at": ""}), ("engine", {"engine": ""}), ("engine_version", {"engine_version": ""}), ], ) def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None: values: dict[str, object] = { "document": make_document(tmp_path), "source_sha256": "0" * 64, "created_at": "2026-05-07T00:00:00Z", "engine": "MinerU", "engine_version": "3.1.0", } values.update(kwargs) with pytest.raises(MetadataInputError, match=field_name): build_metadata(**values) def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None: with pytest.raises(MetadataInputError, match="JSON serializable"): build_metadata( document=make_document(tmp_path), source_sha256="0" * 64, created_at="2026-05-07T00:00:00Z", engine="MinerU", engine_version="3.1.0", engine_options={"path": tmp_path}, ) def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None: document = DocumentRecord( source_pdf=tmp_path / "paper.pdf", pages=(PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN))),), ) summary = build_summary(document) assert summary["inline_formula_count"] == 0 assert summary["display_formula_count"] == 0 def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None: document = DocumentRecord( source_pdf=tmp_path / "paper.pdf", pages=(PageRecord(page_index=0, blocks=(BlockRecord(BlockType.INLINE_FORMULA),)),), warnings=(WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.INFO, "Checker unavailable."),), ) summary = build_summary(document) assert summary["warning_count"] == 1 assert summary["math_render_error_count"] == 0