Files
PDFToMD/tests/test_ir.py
T
2026-05-08 16:42:19 +09:00

137 lines
4.7 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
import pytest
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None:
    """Check that optional fields which are set survive ``DocumentRecord.to_dict()``.

    A document is built with one page (holding one block), one asset and one
    warning, each carrying optional metadata. The serialized dict must expose
    those values, with tuples showing up as lists, and must be accepted by
    ``json.dumps``.
    """
    source_pdf = tmp_path / "paper.pdf"
    formula_bbox = (1.0, 2.0, 3.0, 4.0)

    formula_block = BlockRecord(
        BlockType.INLINE_FORMULA,
        page_index=1,
        bbox=formula_bbox,
        confidence=0.92,
        markdown_span=(10, 20),
    )
    first_page = PageRecord(
        page_index=1,
        width=612,
        height=792,
        blocks=(formula_block,),
    )
    image_asset = AssetRecord(
        "paper.assets/image.png",
        page_index=1,
        bbox=(5.0, 6.0, 7.0, 8.0),
    )
    formula_warning = WarningRecord(
        WarningCode.LOW_CONFIDENCE_FORMULA,
        WarningSeverity.WARNING,
        "Formula confidence is low.",
        page_index=1,
        bbox=formula_bbox,
    )
    document = DocumentRecord(
        source_pdf,
        pages=(first_page,),
        assets=(image_asset,),
        warnings=(formula_warning,),
    )

    payload = document.to_dict()

    # The source path is expected as a plain string.
    assert payload["source_pdf"] == str(source_pdf)

    page_payload = payload["pages"][0]
    assert page_payload["width"] == 612
    assert page_payload["height"] == 792

    # Tuple-valued fields are compared against lists in the serialized form.
    block_payload = page_payload["blocks"][0]
    assert block_payload["bbox"] == [1.0, 2.0, 3.0, 4.0]
    assert block_payload["confidence"] == 0.92
    assert block_payload["markdown_span"] == [10, 20]

    assert payload["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0]
    assert payload["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]

    # Raises if anything in the payload is not JSON-serializable.
    json.dumps(payload)
def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None:
    """Check that optional fields which were never set are left out of the dict.

    The block is created with only its type and the page with only its index
    and blocks, so none of the optional keys should appear in the output of
    ``DocumentRecord.to_dict()``.
    """
    block = BlockRecord(BlockType.PARAGRAPH)
    page = PageRecord(page_index=0, blocks=(block,))
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(page,))

    # Serialize once and derive both views from the same snapshot, so the page
    # and block assertions are guaranteed to describe the same payload.
    page_data = document.to_dict()["pages"][0]
    block_data = page_data["blocks"][0]

    for key in ("page_index", "bbox", "confidence", "markdown_span"):
        assert key not in block_data, f"unexpected block key: {key}"
    for key in ("width", "height"):
        assert key not in page_data, f"unexpected page key: {key}"
def test_block_types_and_warning_codes_match_architecture_set() -> None:
    """Pin the enum vocabularies exposed by ``BlockType`` and ``WarningCode``.

    Block types must match the expected set exactly. Warning codes only need
    to contain the listed codes; additional codes are tolerated.
    """
    expected_block_types = {
        "heading",
        "paragraph",
        "inline_formula",
        "display_formula",
        "table",
        "figure",
        "caption",
        "footnote",
        "reference",
        "unknown",
    }
    required_warning_codes = {
        "ENGINE_MISSING",
        "GPU_UNAVAILABLE",
        "LOW_CONFIDENCE_FORMULA",
        "MATH_RENDER_FAILED",
        "ASSET_LINK_MISSING",
        "READING_ORDER_UNCERTAIN",
        "STRICT_LOCAL_VIOLATION",
        "MINERU_CLI_FAILED",
    }

    actual_block_types = {member.value for member in BlockType}
    assert actual_block_types == expected_block_types

    # Superset comparison: extra warning codes do not fail the test.
    actual_warning_codes = {member.value for member in WarningCode}
    assert actual_warning_codes >= required_warning_codes
@pytest.mark.parametrize(
    "invalid_block_type",
    ["formula", "image"],
)
def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None:
    """Building a ``BlockRecord`` from an unsupported string must raise ``ValueError``.

    The error message has to mention "invalid block_type".
    """
    expect_rejection = pytest.raises(ValueError, match="invalid block_type")
    with expect_rejection:
        BlockRecord(invalid_block_type)  # type: ignore[arg-type]
@pytest.mark.parametrize(
    "invalid_code",
    ["REMOTE_API_USED", "UNKNOWN_WARNING"],
)
def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None:
    """Building a ``WarningRecord`` with an unsupported code must raise ``ValueError``.

    The error message has to mention "invalid code".
    """
    expect_rejection = pytest.raises(ValueError, match="invalid code")
    with expect_rejection:
        WarningRecord(  # type: ignore[arg-type]
            invalid_code,
            WarningSeverity.WARNING,
            "message",
        )
@pytest.mark.parametrize(
    "invalid_severity",
    ["fatal", "warn"],
)
def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None:
    """Building a ``WarningRecord`` with an unsupported severity must raise ``ValueError``.

    The error message has to mention "invalid severity".
    """
    expect_rejection = pytest.raises(ValueError, match="invalid severity")
    with expect_rejection:
        WarningRecord(  # type: ignore[arg-type]
            WarningCode.MATH_RENDER_FAILED,
            invalid_severity,
            "message",
        )
def test_empty_pages_are_rejected(tmp_path: Path) -> None:
    """A ``DocumentRecord`` given an empty ``pages`` tuple must raise ``ValueError``.

    The error message has to mention "at least one page".
    """
    source_pdf = tmp_path / "paper.pdf"
    no_pages = ()
    with pytest.raises(ValueError, match="at least one page"):
        DocumentRecord(source_pdf, pages=no_pages)
def test_empty_source_pdf_is_rejected() -> None:
    """A ``DocumentRecord`` given an empty source path must raise ``ValueError``.

    The error message has to mention "source_pdf".
    """
    only_page = PageRecord(page_index=0)
    with pytest.raises(ValueError, match="source_pdf"):
        DocumentRecord("", pages=(only_page,))
def test_invalid_optional_fields_are_rejected() -> None:
    """Each out-of-range optional ``BlockRecord`` field must raise ``ValueError``.

    Every case pairs the text expected in the error message with a thunk that
    attempts the invalid construction. Cases run in the listed order.
    """
    invalid_cases = (
        # Negative page index.
        ("page_index", lambda: BlockRecord(BlockType.PARAGRAPH, page_index=-1)),
        # Bounding box with three coordinates.
        (
            "bbox",
            lambda: BlockRecord(
                BlockType.PARAGRAPH,
                bbox=(1.0, 2.0, 3.0),  # type: ignore[arg-type]
            ),
        ),
        # Confidence of 1.2.
        ("confidence", lambda: BlockRecord(BlockType.PARAGRAPH, confidence=1.2)),
        # Span whose second value is smaller than its first.
        ("markdown_span", lambda: BlockRecord(BlockType.PARAGRAPH, markdown_span=(5, 3))),
    )

    for expected_fragment, build_invalid_block in invalid_cases:
        with pytest.raises(ValueError, match=expected_fragment):
            build_invalid_block()
def test_asset_paths_must_be_relative() -> None:
    """``AssetRecord`` must reject an absolute path and a ``..``-prefixed path.

    Both rejections must raise ``ValueError`` with "relative" in the message.
    """
    rejected_paths = ("/absolute/image.png", "../outside.png")
    for rejected_path in rejected_paths:
        with pytest.raises(ValueError, match="relative"):
            AssetRecord(rejected_path)