"""Tests for final-status derivation and quality-report rendering."""

from __future__ import annotations

from pathlib import Path

from pdf2md.ir import (
    AssetRecord,
    BlockRecord,
    BlockType,
    DocumentRecord,
    PageRecord,
    TextFidelityRecord,
    WarningCode,
    WarningRecord,
    WarningSeverity,
)
from pdf2md.metadata import build_metadata
from pdf2md.quality import QualityResult
from pdf2md.report import determine_final_status, pages_with_warnings, render_report


def _reading_order_warning() -> WarningRecord:
    """Return a WARNING-severity reading-order record attached to page index 1."""
    return WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )


def _mineru_failure_warning() -> WarningRecord:
    """Return an ERROR-severity MinerU CLI failure record with no page index."""
    return WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )


def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build metadata for a fixed two-page document with one asset.

    Page 0 holds one inline and one display formula block, page 1 holds a
    single paragraph block, and the only asset is attached to page 1.
    ``warnings`` is passed straight through to the ``DocumentRecord``.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure = AssetRecord("paper.assets/fig.png", page_index=1)
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure,),
        warnings=warnings,
    )
    return build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )


def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """Status is success, partial, or failed depending on warnings and quality."""
    clean = make_metadata(tmp_path)
    with_warning = make_metadata(tmp_path, warnings=(_reading_order_warning(),))
    with_error = make_metadata(tmp_path, warnings=(_mineru_failure_warning(),))
    missing_link_quality = QualityResult(missing_asset_link_count=1)

    assert determine_final_status(clean) == "success"
    assert determine_final_status(with_warning) == "partial"
    assert determine_final_status(clean, missing_link_quality) == "partial"
    assert determine_final_status(with_error) == "failed"


def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Page indices from metadata and quality warnings come back merged in order."""
    metadata = make_metadata(tmp_path, warnings=(_reading_order_warning(),))
    math_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )
    quality = QualityResult(warnings=(math_warning,))

    assert pages_with_warnings(metadata, quality) == (0, 1)


def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """A fully populated report contains the header, paths, engine info and counts."""
    source_pdf = tmp_path / "paper.pdf"
    markdown_path = tmp_path / "paper.md"
    metadata = make_metadata(tmp_path, warnings=(_reading_order_warning(),))
    asset_warning = WarningRecord(
        WarningCode.ASSET_LINK_MISSING,
        WarningSeverity.WARNING,
        "Missing asset.",
    )
    quality = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(asset_warning,),
    )

    report = render_report(
        metadata,
        quality=quality,
        markdown_path=markdown_path,
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {source_pdf}",
        f"- Output Markdown: {markdown_path}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in report


def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Without optional paths the report has no path lines and no file is created."""
    report_path = tmp_path / "paper.report.md"
    metadata = make_metadata(tmp_path)

    report = render_report(metadata)

    for label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert label not in report
    assert not report_path.exists()


def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """An ERROR-severity metadata warning makes the report show a failed status."""
    metadata = make_metadata(tmp_path, warnings=(_mineru_failure_warning(),))

    report = render_report(metadata)

    assert "- Final status: `failed`" in report


def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """The math render error count sums the metadata warning and the quality count."""
    metadata_math_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Metadata math failed.",
    )
    metadata = make_metadata(tmp_path, warnings=(metadata_math_warning,))

    report = render_report(metadata, quality=QualityResult(math_render_error_count=2))

    assert "- Math render error count: 3" in report


def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Chunk options in the engine options produce a chunk summary line."""
    chunk_options = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {"strict_local": True, "chunk": chunk_options}

    report = render_report(metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in report


def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None:
    """Single-page conversion options produce a page conversion mode line."""
    page_conversion = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": 20,
        "failed_source_pages": [],
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {"strict_local": True, "page_conversion": page_conversion}

    report = render_report(metadata)

    assert "- Page conversion mode: single-page MinerU inputs, grouped output size: 20" in report


def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None:
    """Aggregate options list the output folder, each part and failed source pages."""
    output_folder = tmp_path / "out" / "paper"
    successful_part = {
        "index": 1,
        "total": 2,
        "source_page_start": 1,
        "source_page_end": 20,
        "markdown_path": str(output_folder / "paper_001.md"),
        "status": "success",
        "warning_count": 0,
    }
    failed_part = {
        "index": 2,
        "total": 2,
        "source_page_start": 21,
        "source_page_end": 23,
        "markdown_path": None,
        "status": "failed",
        "warning_count": 2,
        "failed_source_pages": [22, 23],
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {
        "strict_local": True,
        "output_folder": str(output_folder),
        "parts": [successful_part, failed_part],
    }

    report = render_report(metadata)

    expected_fragments = (
        f"- Output folder: {output_folder}",
        "paper_001.md (source pages 1-20, status success)",
        "- Markdown part 2/2: unavailable (source pages 21-23, status failed)",
        "- Failed source pages for part 2: 22, 23",
    )
    for fragment in expected_fragments:
        assert fragment in report


def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None:
    """Text-fidelity records on the document produce a Text Fidelity section."""
    checked_record = TextFidelityRecord(
        page_index=0,
        source_page_number=3,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=7,
        hangul_count_delta=-3,
        hangul_count_ratio=0.7,
        unexpected_cjk_count=2,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.61,
        replacement_candidate=True,
        comparison_status="checked",
    )
    uncertain_record = TextFidelityRecord(
        page_index=1,
        source_page_number=4,
        pypdf_text_available=True,
        markdown_text_available=False,
        pypdf_hangul_count=5,
        markdown_hangul_count=0,
        hangul_count_delta=-5,
        hangul_count_ratio=0.0,
        unexpected_cjk_count=0,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.0,
        replacement_candidate=False,
        comparison_status="page_mapping_uncertain",
    )
    paragraph_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(paragraph_page,),
        text_fidelity=(checked_record, uncertain_record),
    )
    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    report = render_report(metadata)

    expected_fragments = (
        "## Text Fidelity",
        "- Checked page count: 1",
        "- Low-fidelity page count: 1",
        "- Unexpected CJK count: 2",
        "- Replacement candidate page count: 1",
        "- Low-similarity pages: 0",
        "- Unexpected-CJK pages: 0",
        "- Uncertain page-mapping pages: 1",
    )
    for fragment in expected_fragments:
        assert fragment in report