277 lines
9.6 KiB
Python
277 lines
9.6 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from pdf2md.ir import (
|
|
AssetRecord,
|
|
BlockRecord,
|
|
BlockType,
|
|
DocumentRecord,
|
|
PageRecord,
|
|
TextFidelityRecord,
|
|
WarningCode,
|
|
WarningRecord,
|
|
WarningSeverity,
|
|
)
|
|
from pdf2md.metadata import build_metadata
|
|
from pdf2md.quality import QualityResult
|
|
from pdf2md.report import determine_final_status, pages_with_warnings, render_report
|
|
|
|
|
|
def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build the metadata mapping for a fixed two-page sample document.

    Page 0 holds one inline-formula block and one display-formula block,
    page 1 holds a single paragraph block, and one asset is attached to
    page 1. The supplied ``warnings`` are stored on the document before it
    is handed to ``build_metadata`` together with fixed engine details.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure_asset = AssetRecord("paper.assets/fig.png", page_index=1)

    sample_document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure_asset,),
        warnings=warnings,
    )
    return build_metadata(
        document=sample_document,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
|
|
|
|
|
def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """Check ``determine_final_status`` for success, partial and failed inputs."""
    reading_order_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    cli_error = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )

    clean = make_metadata(tmp_path)
    with_warning = make_metadata(tmp_path, warnings=(reading_order_warning,))
    with_error = make_metadata(tmp_path, warnings=(cli_error,))

    # No warnings at all -> success; a WARNING-severity record -> partial.
    assert determine_final_status(clean) == "success"
    assert determine_final_status(with_warning) == "partial"
    # Clean metadata combined with a quality result reporting a missing asset link.
    assert determine_final_status(clean, QualityResult(missing_asset_link_count=1)) == "partial"
    # An ERROR-severity record -> failed.
    assert determine_final_status(with_error) == "failed"
|
|
|
|
|
|
def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Check that warning pages from metadata and quality are merged in ascending order.

    The metadata warning points at page 1 and the quality warning at page 0,
    and the expected result is ``(0, 1)``.
    """
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    quality_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )

    sample_metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    sample_quality = QualityResult(warnings=(quality_warning,))

    assert pages_with_warnings(sample_metadata, sample_quality) == (0, 1)
|
|
|
|
|
|
def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """Check that a rendered report contains the title, paths, engine details and counts."""
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    quality_warning = WarningRecord(
        WarningCode.ASSET_LINK_MISSING,
        WarningSeverity.WARNING,
        "Missing asset.",
    )

    sample_metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    sample_quality = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(quality_warning,),
    )

    rendered = render_report(
        sample_metadata,
        quality=sample_quality,
        markdown_path=tmp_path / "paper.md",
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    # Checked in order, so the first missing fragment is the one reported.
    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {tmp_path / 'paper.pdf'}",
        f"- Output Markdown: {tmp_path / 'paper.md'}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
|
|
|
|
|
def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Check that rendering without path arguments omits path lines and creates no report file."""
    sample_metadata = make_metadata(tmp_path)
    unwritten_report = tmp_path / "paper.report.md"

    rendered = render_report(sample_metadata)

    for path_label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert path_label not in rendered
    # Rendering must not have produced the report file on disk.
    assert not unwritten_report.exists()
|
|
|
|
|
|
def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """Check that an ERROR-severity warning in metadata yields a ``failed`` status line."""
    cli_error = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )

    rendered = render_report(make_metadata(tmp_path, warnings=(cli_error,)))

    assert "- Final status: `failed`" in rendered
|
|
|
|
|
|
def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """Check the reported math render error count.

    One ``MATH_RENDER_FAILED`` warning in metadata plus a quality count of 2
    is expected to be reported as 3.
    """
    math_failure = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Metadata math failed.",
    )
    sample_metadata = make_metadata(tmp_path, warnings=(math_failure,))
    sample_quality = QualityResult(math_render_error_count=2)

    rendered = render_report(sample_metadata, quality=sample_quality)

    assert "- Math render error count: 3" in rendered
|
|
|
|
|
|
def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Check that a ``chunk`` entry in engine options appears as a chunk line in the report."""
    chunk_options = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    sample_metadata = make_metadata(tmp_path)
    # Replace the engine options wholesale so the chunk details are present.
    sample_metadata["engine_options"] = {"strict_local": True, "chunk": chunk_options}

    rendered = render_report(sample_metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in rendered
|
|
|
|
|
|
def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None:
    """Check that ``page_conversion`` engine options appear as a conversion-mode line."""
    page_conversion_options = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": 20,
        "failed_source_pages": [],
    }
    sample_metadata = make_metadata(tmp_path)
    # Replace the engine options wholesale so the page-conversion details are present.
    sample_metadata["engine_options"] = {
        "strict_local": True,
        "page_conversion": page_conversion_options,
    }

    rendered = render_report(sample_metadata)

    assert "- Page conversion mode: single-page MinerU inputs, grouped output size: 20" in rendered
|
|
|
|
|
|
def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None:
    """Check the report lines for the output folder and per-part Markdown entries.

    The first part succeeded and has a Markdown path; the second failed, has
    no Markdown path, and lists its failed source pages.
    """
    successful_part = {
        "index": 1,
        "total": 2,
        "source_page_start": 1,
        "source_page_end": 20,
        "markdown_path": str(tmp_path / "out" / "paper" / "paper_001.md"),
        "status": "success",
        "warning_count": 0,
    }
    failed_part = {
        "index": 2,
        "total": 2,
        "source_page_start": 21,
        "source_page_end": 23,
        "markdown_path": None,
        "status": "failed",
        "warning_count": 2,
        "failed_source_pages": [22, 23],
    }

    sample_metadata = make_metadata(tmp_path)
    # Replace the engine options wholesale so the aggregate details are present.
    sample_metadata["engine_options"] = {
        "strict_local": True,
        "output_folder": str(tmp_path / "out" / "paper"),
        "parts": [successful_part, failed_part],
    }

    rendered = render_report(sample_metadata)

    assert f"- Output folder: {tmp_path / 'out' / 'paper'}" in rendered
    assert "paper_001.md (source pages 1-20, status success)" in rendered
    assert "- Markdown part 2/2: unavailable (source pages 21-23, status failed)" in rendered
    assert "- Failed source pages for part 2: 22, 23" in rendered
|
|
|
|
|
|
def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None:
    """Check the Text Fidelity section rendered from two fidelity records.

    The first record has comparison status ``checked``, is marked as a
    replacement candidate and has ``unexpected_cjk_count`` of 2. The second
    has comparison status ``page_mapping_uncertain``.
    """
    checked_record = TextFidelityRecord(
        page_index=0,
        source_page_number=3,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=7,
        hangul_count_delta=-3,
        hangul_count_ratio=0.7,
        unexpected_cjk_count=2,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.61,
        replacement_candidate=True,
        comparison_status="checked",
    )
    uncertain_record = TextFidelityRecord(
        page_index=1,
        source_page_number=4,
        pypdf_text_available=True,
        markdown_text_available=False,
        pypdf_hangul_count=5,
        markdown_hangul_count=0,
        hangul_count_delta=-5,
        hangul_count_ratio=0.0,
        unexpected_cjk_count=0,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.0,
        replacement_candidate=False,
        comparison_status="page_mapping_uncertain",
    )
    single_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))

    fidelity_document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(single_page,),
        text_fidelity=(checked_record, uncertain_record),
    )
    sample_metadata = build_metadata(
        document=fidelity_document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    rendered = render_report(sample_metadata)

    # Checked in order, so the first missing line is the one reported.
    expected_lines = (
        "## Text Fidelity",
        "- Checked page count: 1",
        "- Low-fidelity page count: 1",
        "- Unexpected CJK count: 2",
        "- Replacement candidate page count: 1",
        "- Low-similarity pages: 0",
        "- Unexpected-CJK pages: 0",
        "- Uncertain page-mapping pages: 1",
    )
    for line in expected_lines:
        assert line in rendered
|