137 lines
4.7 KiB
Python
137 lines
4.7 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from pdf2md.ir import (
|
|
AssetRecord,
|
|
BlockRecord,
|
|
BlockType,
|
|
DocumentRecord,
|
|
PageRecord,
|
|
WarningCode,
|
|
WarningRecord,
|
|
WarningSeverity,
|
|
)
|
|
|
|
|
|
def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None:
    """Check that populated optional fields survive ``DocumentRecord.to_dict()``.

    A document with one page (holding one block), one asset and one warning is
    built with every optional field set.  The serialized dictionary must expose
    those values, with ``bbox`` and ``markdown_span`` comparing equal to plain
    lists, and must be accepted by ``json.dumps``.
    """
    pdf_path = tmp_path / "paper.pdf"
    formula_box = (1.0, 2.0, 3.0, 4.0)
    image_box = (5.0, 6.0, 7.0, 8.0)

    formula_block = BlockRecord(
        BlockType.INLINE_FORMULA,
        page_index=1,
        bbox=formula_box,
        confidence=0.92,
        markdown_span=(10, 20),
    )
    letter_page = PageRecord(page_index=1, width=612, height=792, blocks=(formula_block,))
    image_asset = AssetRecord("paper.assets/image.png", page_index=1, bbox=image_box)
    low_confidence = WarningRecord(
        WarningCode.LOW_CONFIDENCE_FORMULA,
        WarningSeverity.WARNING,
        "Formula confidence is low.",
        page_index=1,
        bbox=formula_box,
    )
    document = DocumentRecord(
        pdf_path,
        pages=(letter_page,),
        assets=(image_asset,),
        warnings=(low_confidence,),
    )

    serialized = document.to_dict()
    page_out = serialized["pages"][0]
    block_out = page_out["blocks"][0]

    # The expected values below are lists, so a tuple in the output would fail.
    assert serialized["source_pdf"] == str(pdf_path)
    assert page_out["width"] == 612
    assert page_out["height"] == 792
    assert block_out["bbox"] == [1.0, 2.0, 3.0, 4.0]
    assert block_out["confidence"] == 0.92
    assert block_out["markdown_span"] == [10, 20]
    assert serialized["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0]
    assert serialized["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]

    # Raises TypeError if anything non-JSON-serializable leaked into the dict.
    json.dumps(serialized)
|
|
|
|
|
|
def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None:
    """Check that optional fields left unset are absent from the serialized dict.

    A block and a page are created with only their required arguments.  The
    keys for the unset optional fields must not appear in the output of
    ``DocumentRecord.to_dict()``.
    """
    block = BlockRecord(BlockType.PARAGRAPH)
    page = PageRecord(page_index=0, blocks=(block,))
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(page,))

    # Serialize once and derive both views from the same result, instead of
    # calling to_dict() a second time just to reach a different nesting level.
    page_data = document.to_dict()["pages"][0]
    block_data = page_data["blocks"][0]

    assert "page_index" not in block_data
    assert "bbox" not in block_data
    assert "confidence" not in block_data
    assert "markdown_span" not in block_data
    assert "width" not in page_data
    assert "height" not in page_data
|
|
|
|
|
|
def test_block_types_and_warning_codes_match_architecture_set() -> None:
    """Pin the enum values of ``BlockType`` and ``WarningCode``.

    ``BlockType`` must expose exactly the listed values.  ``WarningCode`` must
    contain at least the listed values; additional codes are allowed.
    """
    expected_block_types = {
        "heading",
        "paragraph",
        "inline_formula",
        "display_formula",
        "table",
        "figure",
        "caption",
        "footnote",
        "reference",
        "unknown",
    }
    actual_block_types = {member.value for member in BlockType}
    assert actual_block_types == expected_block_types

    required_warning_codes = {
        "ENGINE_MISSING",
        "GPU_UNAVAILABLE",
        "LOW_CONFIDENCE_FORMULA",
        "MATH_RENDER_FAILED",
        "ASSET_LINK_MISSING",
        "READING_ORDER_UNCERTAIN",
        "STRICT_LOCAL_VIOLATION",
        "MINERU_CLI_FAILED",
    }
    actual_warning_codes = {member.value for member in WarningCode}
    # Superset check: extra warning codes do not fail the test.
    assert actual_warning_codes.issuperset(required_warning_codes)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "invalid_block_type",
    [
        "formula",
        "image",
    ],
)
def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None:
    """Expect ``BlockRecord`` to reject a string that is not a block type.

    The constructor must raise ``ValueError`` with a message matching
    "invalid block_type".
    """
    expected_error = pytest.raises(ValueError, match="invalid block_type")
    with expected_error:
        BlockRecord(invalid_block_type)  # type: ignore[arg-type]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "invalid_code",
    [
        "REMOTE_API_USED",
        "UNKNOWN_WARNING",
    ],
)
def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None:
    """Expect ``WarningRecord`` to reject a code the test treats as invalid.

    The constructor must raise ``ValueError`` with a message matching
    "invalid code".
    """
    expected_error = pytest.raises(ValueError, match="invalid code")
    with expected_error:
        WarningRecord(invalid_code, WarningSeverity.WARNING, "message")  # type: ignore[arg-type]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "invalid_severity",
    [
        "fatal",
        "warn",
    ],
)
def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None:
    """Expect ``WarningRecord`` to reject a severity the test treats as invalid.

    The constructor must raise ``ValueError`` with a message matching
    "invalid severity".
    """
    expected_error = pytest.raises(ValueError, match="invalid severity")
    with expected_error:
        WarningRecord(WarningCode.MATH_RENDER_FAILED, invalid_severity, "message")  # type: ignore[arg-type]
|
|
|
|
|
|
def test_empty_pages_are_rejected(tmp_path: Path) -> None:
    """Expect ``DocumentRecord`` to raise ``ValueError`` for an empty ``pages`` tuple.

    The error message must match "at least one page".
    """
    pdf_path = tmp_path / "paper.pdf"
    no_pages = ()
    with pytest.raises(ValueError, match="at least one page"):
        DocumentRecord(pdf_path, pages=no_pages)
|
|
|
|
|
|
def test_empty_source_pdf_is_rejected() -> None:
    """Expect ``DocumentRecord`` to raise ``ValueError`` for an empty source path.

    The error message must match "source_pdf".
    """
    empty_source = ""
    with pytest.raises(ValueError, match="source_pdf"):
        # The page is built inside the context, exactly as the call is nested.
        DocumentRecord(
            empty_source,
            pages=(PageRecord(page_index=0),),
        )
|
|
|
|
|
|
def test_invalid_optional_fields_are_rejected() -> None:
    """Expect ``BlockRecord`` to raise ``ValueError`` for bad optional values.

    Each case passes a single out-of-contract keyword argument and expects the
    error message to match the name of the offending field.
    """
    # (pattern expected in the error message, keyword arguments that trigger it)
    rejected_cases: tuple[tuple[str, dict[str, object]], ...] = (
        ("page_index", {"page_index": -1}),
        ("bbox", {"bbox": (1.0, 2.0, 3.0)}),  # three values instead of four
        ("confidence", {"confidence": 1.2}),
        ("markdown_span", {"markdown_span": (5, 3)}),  # first value exceeds second
    )
    for field_pattern, bad_kwargs in rejected_cases:
        with pytest.raises(ValueError, match=field_pattern):
            BlockRecord(BlockType.PARAGRAPH, **bad_kwargs)  # type: ignore[arg-type]
|
|
|
|
|
|
def test_asset_paths_must_be_relative() -> None:
    """Expect ``AssetRecord`` to raise ``ValueError`` for the two rejected paths.

    Covers an absolute path and a path starting with ``..``; in both cases the
    error message must match "relative".
    """
    rejected_paths = (
        "/absolute/image.png",
        "../outside.png",
    )
    for asset_path in rejected_paths:
        with pytest.raises(ValueError, match="relative"):
            AssetRecord(asset_path)
|