# Files
# PDFToMD/tests/test_report.py
# T
# 2026-05-08 16:42:19 +09:00
#
# 164 lines
# 5.7 KiB
# Python

from __future__ import annotations
from pathlib import Path
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
from pdf2md.metadata import build_metadata
from pdf2md.quality import QualityResult
from pdf2md.report import determine_final_status, pages_with_warnings, render_report
def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build metadata for a fixed two-page sample document.

    Page 0 holds one inline-formula block and one display-formula block,
    page 1 holds a single paragraph block, and one asset record is attached
    to page 1.

    Args:
        tmp_path: Directory used to form the source PDF path; this helper
            only builds the path and does not create the file itself.
        warnings: Warning records to attach to the document.

    Returns:
        The result of ``build_metadata`` for the sample document, using
        fixed hash, timestamp, and engine values.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure = AssetRecord("paper.assets/fig.png", page_index=1)

    sample = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure,),
        warnings=warnings,
    )
    return build_metadata(
        document=sample,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """Check the status string returned by ``determine_final_status`` for each case.

    Covers metadata without warnings, metadata with a WARNING-severity
    record, warning-free metadata paired with a quality result reporting a
    missing asset link, and metadata with an ERROR-severity record.
    """
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    engine_error = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )

    clean = make_metadata(tmp_path)
    with_warning = make_metadata(tmp_path, warnings=(review_warning,))
    with_error = make_metadata(tmp_path, warnings=(engine_error,))
    missing_link_quality = QualityResult(missing_asset_link_count=1)

    assert determine_final_status(clean) == "success"
    assert determine_final_status(with_warning) == "partial"
    assert determine_final_status(clean, missing_link_quality) == "partial"
    assert determine_final_status(with_error) == "failed"
def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Check that warning pages from metadata and quality are merged in ascending order.

    The metadata warning points at page index 1 and the quality warning at
    page index 0, so the expected result lists 0 before 1.
    """
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    quality_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )

    document_metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    quality_result = QualityResult(warnings=(quality_warning,))

    expected_pages = (0, 1)
    assert pages_with_warnings(document_metadata, quality_result) == expected_pages
def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """Render a report with all optional paths supplied and check its expected lines.

    The metadata carries one warning on page index 1; the quality result
    carries link/math counters plus one page-less warning.
    """
    page_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    asset_warning = WarningRecord(
        WarningCode.ASSET_LINK_MISSING,
        WarningSeverity.WARNING,
        "Missing asset.",
    )
    document_metadata = make_metadata(tmp_path, warnings=(page_warning,))
    quality_result = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(asset_warning,),
    )
    markdown_file = tmp_path / "paper.md"

    rendered = render_report(
        document_metadata,
        quality=quality_result,
        markdown_path=markdown_file,
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    # Checked in the same order as the original assertion sequence.
    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {tmp_path / 'paper.pdf'}",
        f"- Output Markdown: {markdown_file}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Check that a report rendered without paths has no path lines and creates no report file."""
    rendered = render_report(make_metadata(tmp_path))

    # No output paths were passed, so none of these labels should appear.
    for absent_label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert absent_label not in rendered

    unwritten_report = tmp_path / "paper.report.md"
    assert not unwritten_report.exists()
def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """Check that a single ERROR-severity warning makes the report show the failed status."""
    cli_failure = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )
    failing_metadata = make_metadata(tmp_path, warnings=(cli_failure,))

    rendered = render_report(failing_metadata)

    assert "- Final status: `failed`" in rendered
def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """Check the reported math render error count when both sources contribute.

    The metadata holds one MATH_RENDER_FAILED warning and the quality result
    reports two math render errors; the report is expected to show 3.
    """
    math_failure = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Metadata math failed.",
    )
    document_metadata = make_metadata(tmp_path, warnings=(math_failure,))
    quality_result = QualityResult(math_render_error_count=2)

    rendered = render_report(document_metadata, quality=quality_result)

    assert "- Math render error count: 3" in rendered
def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Check that chunk details placed in ``engine_options`` appear as a chunk line in the report."""
    chunk_options = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    chunked_metadata = make_metadata(tmp_path)
    # Overwrite the helper's default engine options with ones carrying chunk details.
    chunked_metadata["engine_options"] = {"strict_local": True, "chunk": chunk_options}

    rendered = render_report(chunked_metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in rendered