225 lines
7.2 KiB
Python
225 lines
7.2 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from pdf2md.ir import (
|
|
AssetRecord,
|
|
BlockRecord,
|
|
BlockType,
|
|
DocumentRecord,
|
|
PageRecord,
|
|
TextFidelityRecord,
|
|
WarningCode,
|
|
WarningRecord,
|
|
WarningSeverity,
|
|
)
|
|
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
|
|
|
|
|
|
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build the two-page document fixture used by the tests in this module.

    Page 0 holds a heading, an inline formula (confidence 0.98) and a display
    formula (with a bbox); page 1 holds a paragraph and a display formula.
    One asset and two warnings are attached to the document.
    """
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    document_pages = (
        PageRecord(page_index=0, blocks=first_page_blocks),
        PageRecord(page_index=1, blocks=second_page_blocks),
    )

    figure_assets = (AssetRecord("paper.assets/figure.png", page_index=1),)

    # The page-1 warning is deliberately listed before the page-0 one, so
    # the warning order differs from page order.
    document_warnings = (
        WarningRecord(
            WarningCode.READING_ORDER_UNCERTAIN,
            WarningSeverity.WARNING,
            "Check reading order.",
            page_index=1,
        ),
        WarningRecord(
            WarningCode.MATH_RENDER_FAILED,
            WarningSeverity.ERROR,
            "Math failed to render.",
            page_index=0,
        ),
    )

    return DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=document_pages,
        assets=figure_assets,
        warnings=document_warnings,
    )
|
|
|
|
|
|
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Return ``build_metadata`` output for the shared fixture document.

    All non-document inputs are fixed literals so that every test calling
    this helper sees the same metadata.
    """
    fixture_document = make_document(tmp_path)
    return build_metadata(
        document=fixture_document,
        source_sha256="0" * 64,
        created_at="2026-05-07T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
|
|
|
|
|
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The metadata mapping exposes exactly the expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }

    actual_keys = set(build_test_metadata(tmp_path))

    # Exact set equality: both missing and unexpected keys fail the test.
    assert actual_keys == expected_keys
|
|
|
|
|
|
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """The summary matches the counts of records in the fixture document."""
    # Fixture contents: 2 pages, 2 warnings, 1 asset, 2 display formulas,
    # 1 inline formula, and 1 MATH_RENDER_FAILED warning at ERROR severity.
    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }

    summary = build_test_metadata(tmp_path)["summary"]

    assert summary == expected_summary
|
|
|
|
|
|
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Warnings keep the order and page indices given on the document."""
    expected_codes = ["READING_ORDER_UNCERTAIN", "MATH_RENDER_FAILED"]

    warnings = build_test_metadata(tmp_path)["warnings"]
    emitted_codes = [entry["code"] for entry in warnings]

    assert emitted_codes == expected_codes

    # Safe to unpack: the assertion above pins the list to two entries.
    reading_order_warning, math_render_warning = warnings
    assert reading_order_warning["page_index"] == 1
    assert math_render_warning["page_index"] == 0
|
|
|
|
|
|
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """Optional block fields appear in the output only when the record set them."""
    first_page = build_test_metadata(tmp_path)["pages"][0]
    page_blocks = first_page["blocks"]
    # Fixture order on page 0: heading, inline formula, display formula.
    heading = page_blocks[0]
    inline_formula = page_blocks[1]
    display_formula = page_blocks[2]

    assert "confidence" not in heading
    assert inline_formula["confidence"] == 0.98
    assert "bbox" not in inline_formula
    # The fixture passes the bbox as a tuple; the metadata compares equal to a list.
    assert display_formula["bbox"] == [1.0, 2.0, 3.0, 4.0]
|
|
|
|
|
|
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """The full metadata mapping can be passed to ``json.dumps``."""
    metadata = build_test_metadata(tmp_path)

    # No assertion needed: json.dumps raises on unsupported values, which
    # fails the test.
    json.dumps(metadata)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Each core input is rejected when it is ``None`` or an empty string.

    Every case overrides one otherwise valid argument and expects a
    ``MetadataInputError`` whose message matches the overridden field name.
    """
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Later entries win, so the parametrized override replaces one valid value.
    values: dict[str, object] = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**values)
|
|
|
|
|
|
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Engine options containing a ``Path`` value raise ``MetadataInputError``."""
    # A pathlib.Path is not supported by the default JSON encoder.
    unserializable_options = {"path": tmp_path}

    with pytest.raises(MetadataInputError, match="JSON serializable"):
        build_metadata(
            document=make_document(tmp_path),
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=unserializable_options,
        )
|
|
|
|
|
|
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A page with only PARAGRAPH and UNKNOWN blocks reports zero formulas."""
    non_formula_blocks = (
        BlockRecord(BlockType.PARAGRAPH),
        BlockRecord(BlockType.UNKNOWN),
    )
    only_page = PageRecord(page_index=0, blocks=non_formula_blocks)
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(only_page,),
    )

    summary = build_summary(document)

    assert summary["inline_formula_count"] == 0
    assert summary["display_formula_count"] == 0
|
|
|
|
|
|
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity MATH_RENDER_FAILED warning is counted only as a warning."""
    formula_page = PageRecord(
        page_index=0,
        blocks=(BlockRecord(BlockType.INLINE_FORMULA),),
    )
    info_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.INFO,
        "Checker unavailable.",
    )
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page,),
        warnings=(info_warning,),
    )

    summary = build_summary(document)

    # It raises the overall warning total but not the render-error total.
    assert summary["warning_count"] == 1
    assert summary["math_render_error_count"] == 0
|
|
|
|
|
|
def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None:
    """Text-fidelity records surface in the metadata and in the summary counters.

    The document carries one checked fidelity record for page 0, flagged as a
    replacement candidate, plus TEXT_FIDELITY_LOW and
    UNEXPECTED_CJK_IN_KOREAN_TEXT warnings on the same page.
    """
    fidelity_record = TextFidelityRecord(
        page_index=0,
        source_page_number=1,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=8,
        hangul_count_delta=-2,
        hangul_count_ratio=0.8,
        unexpected_cjk_count=1,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.72,
        replacement_candidate=True,
        comparison_status="checked",
    )
    low_fidelity_warning = WarningRecord(
        WarningCode.TEXT_FIDELITY_LOW,
        WarningSeverity.WARNING,
        "Low text fidelity.",
        page_index=0,
    )
    unexpected_cjk_warning = WarningRecord(
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningSeverity.WARNING,
        "Unexpected CJK.",
        page_index=0,
    )
    paragraph_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(paragraph_page,),
        text_fidelity=(fidelity_record,),
        warnings=(low_fidelity_warning, unexpected_cjk_warning),
    )

    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    assert metadata["text_fidelity"][0]["page_index"] == 0
    assert metadata["text_fidelity"][0]["replacement_candidate"] is True

    # Summary counters are checked one at a time, in this order, so a
    # failure identifies the counter that is wrong.
    expected_summary_counts = {
        "text_fidelity_checked_page_count": 1,
        "text_fidelity_low_page_count": 1,
        "text_fidelity_unexpected_cjk_count": 1,
        "text_fidelity_replacement_candidate_page_count": 1,
        "text_fidelity_page_mapping_uncertain_count": 0,
    }
    for summary_key, expected_count in expected_summary_counts.items():
        assert metadata["summary"][summary_key] == expected_count
|