modify pdftomd
This commit is contained in:
@@ -8,6 +8,7 @@ from pdf2md.ir import (
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
TextFidelityRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
@@ -161,3 +162,115 @@ def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path:
|
||||
report = render_report(metadata)
|
||||
|
||||
assert "- Chunk: 2/3, source pages: 21-40" in report
|
||||
|
||||
|
||||
def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None:
    """The report mentions the page-conversion mode and grouped output size.

    The metadata's ``engine_options`` is replaced with a payload whose
    ``page_conversion`` entry uses ``mode="single_page"`` and an
    ``output_group_page_count`` of 20; the rendered report must contain the
    matching summary line.
    """
    page_conversion = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": 20,
        "failed_source_pages": [],
    }
    report_metadata = make_metadata(tmp_path)
    report_metadata["engine_options"] = {
        "strict_local": True,
        "page_conversion": page_conversion,
    }

    rendered = render_report(report_metadata)

    expected_line = "- Page conversion mode: single-page MinerU inputs, grouped output size: 20"
    assert expected_line in rendered
|
||||
|
||||
|
||||
def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None:
    """The report lists the output folder and one entry per markdown part.

    Two parts are supplied through ``engine_options["parts"]``: a successful
    one with a markdown path, and a failed one whose ``markdown_path`` is
    ``None`` and which carries ``failed_source_pages``. The rendered report
    must contain the expected line (or line fragment) for each.
    """
    output_folder = tmp_path / "out" / "paper"

    succeeded_part = {
        "index": 1,
        "total": 2,
        "source_page_start": 1,
        "source_page_end": 20,
        "markdown_path": str(output_folder / "paper_001.md"),
        "status": "success",
        "warning_count": 0,
    }
    failed_part = {
        "index": 2,
        "total": 2,
        "source_page_start": 21,
        "source_page_end": 23,
        "markdown_path": None,
        "status": "failed",
        "warning_count": 2,
        "failed_source_pages": [22, 23],
    }

    report_metadata = make_metadata(tmp_path)
    report_metadata["engine_options"] = {
        "strict_local": True,
        "output_folder": str(output_folder),
        "parts": [succeeded_part, failed_part],
    }

    rendered = render_report(report_metadata)

    # Checked in order, so the first missing fragment is the one reported.
    expected_fragments = (
        f"- Output folder: {output_folder}",
        "paper_001.md (source pages 1-20, status success)",
        "- Markdown part 2/2: unavailable (source pages 21-23, status failed)",
        "- Failed source pages for part 2: 22, 23",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
||||
|
||||
|
||||
def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None:
    """The report gains a "Text Fidelity" section when the document has diagnostics.

    A document with two ``TextFidelityRecord`` entries is turned into
    metadata via ``build_metadata`` and rendered; the report must contain
    the section heading and each expected summary line.
    """
    paragraph_page = PageRecord(
        page_index=0,
        blocks=(BlockRecord(BlockType.PARAGRAPH),),
    )

    # Page index 0: comparison_status "checked", flagged as a replacement candidate.
    checked_record = TextFidelityRecord(
        page_index=0,
        source_page_number=3,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=7,
        hangul_count_delta=-3,
        hangul_count_ratio=0.7,
        unexpected_cjk_count=2,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.61,
        replacement_candidate=True,
        comparison_status="checked",
    )
    # Page index 1: no markdown text, comparison_status "page_mapping_uncertain".
    uncertain_record = TextFidelityRecord(
        page_index=1,
        source_page_number=4,
        pypdf_text_available=True,
        markdown_text_available=False,
        pypdf_hangul_count=5,
        markdown_hangul_count=0,
        hangul_count_delta=-5,
        hangul_count_ratio=0.0,
        unexpected_cjk_count=0,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.0,
        replacement_candidate=False,
        comparison_status="page_mapping_uncertain",
    )

    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(paragraph_page,),
        text_fidelity=(checked_record, uncertain_record),
    )
    report_metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    rendered = render_report(report_metadata)

    # Checked in order, so the first missing line is the one reported.
    expected_lines = (
        "## Text Fidelity",
        "- Checked page count: 1",
        "- Low-fidelity page count: 1",
        "- Unexpected CJK count: 2",
        "- Replacement candidate page count: 1",
        "- Low-similarity pages: 0",
        "- Unexpected-CJK pages: 0",
        "- Uncertain page-mapping pages: 1",
    )
    for line in expected_lines:
        assert line in rendered
|
||||
|
||||
Reference in New Issue
Block a user