from __future__ import annotations import json from pathlib import Path import pytest from pdf2md.ir import ( AssetRecord, BlockRecord, BlockType, DocumentRecord, PageRecord, WarningCode, WarningRecord, WarningSeverity, ) def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None: block = BlockRecord( BlockType.INLINE_FORMULA, page_index=1, bbox=(1.0, 2.0, 3.0, 4.0), confidence=0.92, markdown_span=(10, 20), ) page = PageRecord(page_index=1, width=612, height=792, blocks=(block,)) asset = AssetRecord("paper.assets/image.png", page_index=1, bbox=(5.0, 6.0, 7.0, 8.0)) warning = WarningRecord( WarningCode.LOW_CONFIDENCE_FORMULA, WarningSeverity.WARNING, "Formula confidence is low.", page_index=1, bbox=(1.0, 2.0, 3.0, 4.0), ) document = DocumentRecord(tmp_path / "paper.pdf", pages=(page,), assets=(asset,), warnings=(warning,)) data = document.to_dict() assert data["source_pdf"] == str(tmp_path / "paper.pdf") assert data["pages"][0]["width"] == 612 assert data["pages"][0]["height"] == 792 assert data["pages"][0]["blocks"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0] assert data["pages"][0]["blocks"][0]["confidence"] == 0.92 assert data["pages"][0]["blocks"][0]["markdown_span"] == [10, 20] assert data["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0] assert data["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0] json.dumps(data) def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None: block = BlockRecord(BlockType.PARAGRAPH) page = PageRecord(page_index=0, blocks=(block,)) document = DocumentRecord(tmp_path / "paper.pdf", pages=(page,)) block_data = document.to_dict()["pages"][0]["blocks"][0] page_data = document.to_dict()["pages"][0] assert "page_index" not in block_data assert "bbox" not in block_data assert "confidence" not in block_data assert "markdown_span" not in block_data assert "width" not in page_data assert "height" not in page_data def test_block_types_and_warning_codes_match_architecture_set() -> None: assert {item.value for item in BlockType} == { "heading", "paragraph", "inline_formula", "display_formula", "table", "figure", "caption", "footnote", "reference", "unknown", } assert {item.value for item in WarningCode} >= { "ENGINE_MISSING", "GPU_UNAVAILABLE", "LOW_CONFIDENCE_FORMULA", "MATH_RENDER_FAILED", "ASSET_LINK_MISSING", "READING_ORDER_UNCERTAIN", "STRICT_LOCAL_VIOLATION", "MINERU_CLI_FAILED", } @pytest.mark.parametrize("invalid_block_type", ["formula", "image"]) def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None: with pytest.raises(ValueError, match="invalid block_type"): BlockRecord(invalid_block_type) # type: ignore[arg-type] @pytest.mark.parametrize("invalid_code", ["REMOTE_API_USED", "UNKNOWN_WARNING"]) def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None: with pytest.raises(ValueError, match="invalid code"): WarningRecord(invalid_code, WarningSeverity.WARNING, "message") # type: ignore[arg-type] @pytest.mark.parametrize("invalid_severity", ["fatal", "warn"]) def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None: with pytest.raises(ValueError, match="invalid severity"): WarningRecord(WarningCode.MATH_RENDER_FAILED, invalid_severity, "message") # type: ignore[arg-type] def test_empty_pages_are_rejected(tmp_path: Path) -> None: with pytest.raises(ValueError, match="at least one page"): DocumentRecord(tmp_path / "paper.pdf", pages=()) def test_empty_source_pdf_is_rejected() -> None: with pytest.raises(ValueError, match="source_pdf"): DocumentRecord("", pages=(PageRecord(page_index=0),)) def test_invalid_optional_fields_are_rejected() -> None: with pytest.raises(ValueError, match="page_index"): BlockRecord(BlockType.PARAGRAPH, page_index=-1) with pytest.raises(ValueError, match="bbox"): BlockRecord(BlockType.PARAGRAPH, bbox=(1.0, 2.0, 3.0)) # type: ignore[arg-type] with pytest.raises(ValueError, match="confidence"): BlockRecord(BlockType.PARAGRAPH, confidence=1.2) with pytest.raises(ValueError, match="markdown_span"): BlockRecord(BlockType.PARAGRAPH, markdown_span=(5, 3)) def test_asset_paths_must_be_relative() -> None: with pytest.raises(ValueError, match="relative"): AssetRecord("/absolute/image.png") with pytest.raises(ValueError, match="relative"): AssetRecord("../outside.png")