Files
PDFToMD/tests/test_metadata.py
2026-05-14 10:16:59 +09:00

225 lines
7.2 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
import pytest
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
TextFidelityRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build the two-page document fixture shared by the metadata tests.

    Page 0 holds a heading, an inline formula carrying a confidence value and
    a display formula carrying a bounding box. Page 1 holds a paragraph and a
    display formula. The document also has one asset on page 1 and two
    warnings, listed as page 1 first and page 0 second.

    Args:
        tmp_path: Directory under which the ``paper.pdf`` source path is
            placed. The file itself is not created here.

    Returns:
        The assembled ``DocumentRecord``.
    """
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    first_page = PageRecord(page_index=0, blocks=first_page_blocks)

    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    second_page = PageRecord(page_index=1, blocks=second_page_blocks)

    pdf_path = tmp_path / "paper.pdf"
    figure_asset = AssetRecord("paper.assets/figure.png", page_index=1)
    # The warnings are deliberately not in page order, so tests can check
    # that the emitted order follows the input order.
    reading_order_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Check reading order.",
        page_index=1,
    )
    math_render_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Math failed to render.",
        page_index=0,
    )

    return DocumentRecord(
        source_pdf=pdf_path,
        pages=(first_page, second_page),
        assets=(figure_asset,),
        warnings=(reading_order_warning, math_render_warning),
    )
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Run ``build_metadata`` on the shared fixture document with fixed inputs.

    Args:
        tmp_path: Directory handed to ``make_document`` for the source path.

    Returns:
        Whatever ``build_metadata`` returns for the fixture document.
    """
    fixture_document = make_document(tmp_path)
    return build_metadata(
        document=fixture_document,
        source_sha256="0" * 64,
        created_at="2026-05-07T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The fixture metadata exposes exactly the expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }
    produced = build_test_metadata(tmp_path)
    assert set(produced) == expected_keys
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """The summary section matches the counts of the fixture's records."""
    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }
    produced = build_test_metadata(tmp_path)
    assert produced["summary"] == expected_summary
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Warnings come out in input order, each keeping its own page index."""
    emitted = build_test_metadata(tmp_path)["warnings"]

    emitted_codes = [entry["code"] for entry in emitted]
    assert emitted_codes == ["READING_ORDER_UNCERTAIN", "MATH_RENDER_FAILED"]

    # The fixture lists the page-1 warning before the page-0 warning.
    reading_order_entry = emitted[0]
    assert reading_order_entry["page_index"] == 1
    math_render_entry = emitted[1]
    assert math_render_entry["page_index"] == 0
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """``confidence`` and ``bbox`` keys appear only on blocks that set them."""
    first_page = build_test_metadata(tmp_path)["pages"][0]
    page_blocks = first_page["blocks"]

    # Block 0 (heading) was built without a confidence value.
    heading_block = page_blocks[0]
    assert "confidence" not in heading_block

    # Block 1 (inline formula) was built with a confidence but no bbox.
    inline_formula_block = page_blocks[1]
    assert inline_formula_block["confidence"] == 0.98
    assert "bbox" not in inline_formula_block

    # Block 2 (display formula) was built with a bbox tuple; it is compared
    # against a list here.
    display_formula_block = page_blocks[2]
    assert display_formula_block["bbox"] == [1.0, 2.0, 3.0, 4.0]
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """``json.dumps`` accepts the fixture metadata without raising."""
    produced = build_test_metadata(tmp_path)
    json.dumps(produced)
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Blanking one core input raises ``MetadataInputError`` naming that field.

    Args:
        tmp_path: Directory handed to ``make_document`` for the source path.
        field_name: Pattern the error message must match.
        kwargs: Override replacing one valid input with ``None`` or ``""``.
    """
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Later entries win, so the parametrized override replaces the valid value.
    call_arguments: dict[str, object] = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**call_arguments)
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Engine options holding a ``Path`` value are rejected with ``MetadataInputError``."""
    # A Path object is the non-serializable value under test.
    options_with_path = {"path": tmp_path}

    with pytest.raises(MetadataInputError, match="JSON serializable"):
        build_metadata(
            document=make_document(tmp_path),
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=options_with_path,
        )
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A page with only PARAGRAPH and UNKNOWN blocks yields zero formula counts."""
    formula_free_page = PageRecord(
        page_index=0,
        blocks=(BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN)),
    )
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_free_page,),
    )

    counts = build_summary(document)

    for count_key in ("inline_formula_count", "display_formula_count"):
        assert counts[count_key] == 0
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity MATH_RENDER_FAILED warning counts as a warning, not a render error."""
    formula_page = PageRecord(
        page_index=0,
        blocks=(BlockRecord(BlockType.INLINE_FORMULA),),
    )
    info_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.INFO,
        "Checker unavailable.",
    )
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page,),
        warnings=(info_warning,),
    )

    counts = build_summary(document)

    assert counts["warning_count"] == 1
    assert counts["math_render_error_count"] == 0
def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None:
    """Text-fidelity records show up in the metadata and in the summary counts."""
    paragraph_page = PageRecord(
        page_index=0,
        blocks=(BlockRecord(BlockType.PARAGRAPH),),
    )
    fidelity_record = TextFidelityRecord(
        page_index=0,
        source_page_number=1,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=8,
        hangul_count_delta=-2,
        hangul_count_ratio=0.8,
        unexpected_cjk_count=1,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.72,
        replacement_candidate=True,
        comparison_status="checked",
    )
    low_fidelity_warning = WarningRecord(
        WarningCode.TEXT_FIDELITY_LOW,
        WarningSeverity.WARNING,
        "Low text fidelity.",
        page_index=0,
    )
    unexpected_cjk_warning = WarningRecord(
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningSeverity.WARNING,
        "Unexpected CJK.",
        page_index=0,
    )
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(paragraph_page,),
        text_fidelity=(fidelity_record,),
        warnings=(low_fidelity_warning, unexpected_cjk_warning),
    )

    produced = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    assert produced["text_fidelity"][0]["page_index"] == 0
    assert produced["text_fidelity"][0]["replacement_candidate"] is True

    expected_counts = {
        "text_fidelity_checked_page_count": 1,
        "text_fidelity_low_page_count": 1,
        "text_fidelity_unexpected_cjk_count": 1,
        "text_fidelity_replacement_candidate_page_count": 1,
        "text_fidelity_page_mapping_uncertain_count": 0,
    }
    for summary_key, expected_value in expected_counts.items():
        assert produced["summary"][summary_key] == expected_value