add pdftomd
This commit is contained in:
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
|
||||
|
||||
|
||||
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build the shared two-page document fixture used by the metadata tests.

    Page 0 holds a heading, an inline formula (with a confidence) and a
    display formula (with a bbox); page 1 holds a paragraph and a display
    formula. The document also carries one asset and two warnings, and the
    warnings are deliberately listed out of page order (page 1, then page 0).
    """
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    first_page = PageRecord(page_index=0, blocks=first_page_blocks)

    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    second_page = PageRecord(page_index=1, blocks=second_page_blocks)

    figure_assets = (AssetRecord("paper.assets/figure.png", page_index=1),)
    recorded_warnings = (
        WarningRecord(WarningCode.READING_ORDER_UNCERTAIN, WarningSeverity.WARNING, "Check reading order.", page_index=1),
        WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.ERROR, "Math failed to render.", page_index=0),
    )

    return DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(first_page, second_page),
        assets=figure_assets,
        warnings=recorded_warnings,
    )
|
||||
|
||||
|
||||
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Return metadata built from the shared fixture document and fixed inputs.

    The hash, timestamp, engine name/version and engine options are constant
    so every test sees the same metadata apart from the temporary path.
    """
    fixture_document = make_document(tmp_path)
    return build_metadata(
        document=fixture_document,
        source_sha256="0" * 64,
        created_at="2026-05-07T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
||||
|
||||
|
||||
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The metadata mapping exposes exactly the expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }

    actual_keys = set(build_test_metadata(tmp_path))

    assert actual_keys == expected_keys
|
||||
|
||||
|
||||
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """The summary section reflects the counts of the fixture's records."""
    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }

    summary = build_test_metadata(tmp_path)["summary"]

    assert summary == expected_summary
|
||||
|
||||
|
||||
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Warnings keep their original order and their page indexes."""
    serialized_warnings = build_test_metadata(tmp_path)["warnings"]

    # The fixture lists the page-1 warning before the page-0 warning, so any
    # re-sorting by page would flip this sequence.
    codes_in_order = [entry["code"] for entry in serialized_warnings]
    assert codes_in_order == ["READING_ORDER_UNCERTAIN", "MATH_RENDER_FAILED"]

    reading_order_warning = serialized_warnings[0]
    assert reading_order_warning["page_index"] == 1

    math_render_warning = serialized_warnings[1]
    assert math_render_warning["page_index"] == 0
|
||||
|
||||
|
||||
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """Optional block fields appear in the output only when the record set them."""
    first_page_blocks = build_test_metadata(tmp_path)["pages"][0]["blocks"]

    # Heading block: created without a confidence.
    heading = first_page_blocks[0]
    assert "confidence" not in heading

    # Inline formula block: created with a confidence but no bbox.
    inline_formula = first_page_blocks[1]
    assert inline_formula["confidence"] == 0.98
    assert "bbox" not in inline_formula

    # Display formula block: created with a bbox tuple, compared here as a list.
    display_formula = first_page_blocks[2]
    assert display_formula["bbox"] == [1.0, 2.0, 3.0, 4.0]
|
||||
|
||||
|
||||
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """Serializing the fixture metadata with ``json.dumps`` must not raise."""
    fixture_metadata = build_test_metadata(tmp_path)
    json.dumps(fixture_metadata)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Each missing or empty core input is rejected with an error naming the field."""
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Overlay the single invalid value for this case on the valid baseline.
    call_inputs: dict[str, object] = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**call_inputs)
|
||||
|
||||
|
||||
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Engine options holding a ``Path`` value are rejected as not JSON serializable."""
    # A Path object is passed as an option value to trigger the rejection.
    unserializable_options = {"path": tmp_path}

    with pytest.raises(MetadataInputError, match="JSON serializable"):
        build_metadata(
            document=make_document(tmp_path),
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=unserializable_options,
        )
|
||||
|
||||
|
||||
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A page with only paragraph and unknown blocks yields zero formula counts."""
    non_formula_blocks = (BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN))
    only_page = PageRecord(page_index=0, blocks=non_formula_blocks)
    document = DocumentRecord(source_pdf=tmp_path / "paper.pdf", pages=(only_page,))

    counts = build_summary(document)

    assert counts["inline_formula_count"] == 0
    assert counts["display_formula_count"] == 0
|
||||
|
||||
|
||||
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity MATH_RENDER_FAILED warning counts as a warning, not a render error."""
    formula_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.INLINE_FORMULA),))
    info_only_warnings = (
        WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.INFO, "Checker unavailable."),
    )
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page,),
        warnings=info_only_warnings,
    )

    counts = build_summary(document)

    assert counts["warning_count"] == 1
    assert counts["math_render_error_count"] == 0
|
||||
Reference in New Issue
Block a user