modify pdftomd

This commit is contained in:
김경종
2026-05-14 10:16:59 +09:00
parent 2232b51fc9
commit dc11880140
69 changed files with 7784 additions and 1150 deletions
+113
View File
@@ -8,6 +8,7 @@ from pdf2md.ir import (
BlockType,
DocumentRecord,
PageRecord,
TextFidelityRecord,
WarningCode,
WarningRecord,
WarningSeverity,
@@ -161,3 +162,115 @@ def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path:
report = render_report(metadata)
assert "- Chunk: 2/3, source pages: 21-40" in report
def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None:
    """Check that the report carries the page-conversion summary line.

    The metadata's ``engine_options`` is replaced with a ``page_conversion``
    entry describing ``single_page`` mode and an output group size of 20;
    the rendered report must contain the corresponding bullet.
    """
    # Arrange: metadata whose engine options describe single-page conversion.
    metadata = make_metadata(tmp_path)
    page_conversion = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": 20,
        "failed_source_pages": [],
    }
    metadata["engine_options"] = {
        "strict_local": True,
        "page_conversion": page_conversion,
    }

    # Act
    rendered = render_report(metadata)

    # Assert
    expected_line = "- Page conversion mode: single-page MinerU inputs, grouped output size: 20"
    assert expected_line in rendered
def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None:
    """Check that the report lists the output folder and every Markdown part.

    ``engine_options`` is given an ``output_folder`` and two ``parts``: a
    successful part covering source pages 1-20 with a Markdown path, and a
    failed part covering pages 21-23 with no Markdown path and two failed
    source pages. The rendered report must mention the folder, both parts,
    and the failed pages of part 2.
    """
    output_folder = tmp_path / "out" / "paper"

    # Part 1 succeeded and produced a Markdown file.
    succeeded_part = {
        "index": 1,
        "total": 2,
        "source_page_start": 1,
        "source_page_end": 20,
        "markdown_path": str(output_folder / "paper_001.md"),
        "status": "success",
        "warning_count": 0,
    }
    # Part 2 failed: no Markdown path, and the failing source pages are recorded.
    failed_part = {
        "index": 2,
        "total": 2,
        "source_page_start": 21,
        "source_page_end": 23,
        "markdown_path": None,
        "status": "failed",
        "warning_count": 2,
        "failed_source_pages": [22, 23],
    }

    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {
        "strict_local": True,
        "output_folder": str(output_folder),
        "parts": [succeeded_part, failed_part],
    }

    rendered = render_report(metadata)

    expected_lines = (
        f"- Output folder: {tmp_path / 'out' / 'paper'}",
        "paper_001.md (source pages 1-20, status success)",
        "- Markdown part 2/2: unavailable (source pages 21-23, status failed)",
        "- Failed source pages for part 2: 22, 23",
    )
    for expected in expected_lines:
        assert expected in rendered
def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None:
    """Check that text-fidelity diagnostics surface in a report section.

    A document is built with two ``TextFidelityRecord`` entries: one with
    ``comparison_status="checked"`` that is a replacement candidate and has
    two unexpected CJK characters, and one with
    ``comparison_status="page_mapping_uncertain"`` and no Markdown text.
    Metadata built from that document is rendered, and the report must
    contain the ``## Text Fidelity`` heading plus the expected summary lines.
    """
    source_pdf = tmp_path / "paper.pdf"
    pages = (PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),)),)

    # Compared page: fewer Hangul characters in Markdown than in pypdf text.
    checked_record = TextFidelityRecord(
        page_index=0,
        source_page_number=3,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=7,
        hangul_count_delta=-3,
        hangul_count_ratio=0.7,
        unexpected_cjk_count=2,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.61,
        replacement_candidate=True,
        comparison_status="checked",
    )
    # Page with no Markdown text and an uncertain page mapping.
    uncertain_record = TextFidelityRecord(
        page_index=1,
        source_page_number=4,
        pypdf_text_available=True,
        markdown_text_available=False,
        pypdf_hangul_count=5,
        markdown_hangul_count=0,
        hangul_count_delta=-5,
        hangul_count_ratio=0.0,
        unexpected_cjk_count=0,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.0,
        replacement_candidate=False,
        comparison_status="page_mapping_uncertain",
    )

    document = DocumentRecord(
        source_pdf=source_pdf,
        pages=pages,
        text_fidelity=(checked_record, uncertain_record),
    )
    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    rendered = render_report(metadata)

    # NOTE(review): the values after the three "... pages:" lines look like
    # page indices rather than counts (0 and 1 match the records' page_index)
    # — confirm against render_report.
    expected_lines = (
        "## Text Fidelity",
        "- Checked page count: 1",
        "- Low-fidelity page count: 1",
        "- Unexpected CJK count: 2",
        "- Replacement candidate page count: 1",
        "- Low-similarity pages: 0",
        "- Unexpected-CJK pages: 0",
        "- Uncertain page-mapping pages: 1",
    )
    for expected in expected_lines:
        assert expected in rendered