modify pdftomd

This commit is contained in:
김경종
2026-05-14 10:16:59 +09:00
parent 2232b51fc9
commit dc11880140
69 changed files with 7784 additions and 1150 deletions
+51
View File
@@ -11,6 +11,7 @@ from pdf2md.ir import (
BlockType,
DocumentRecord,
PageRecord,
TextFidelityRecord,
WarningCode,
WarningRecord,
WarningSeverity,
@@ -171,3 +172,53 @@ def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path)
assert summary["warning_count"] == 1
assert summary["math_render_error_count"] == 0
def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None:
    """Verify that build_metadata exposes text-fidelity records and their summary counters.

    A one-page document carrying a single ``TextFidelityRecord`` and two
    page-0 warnings (``TEXT_FIDELITY_LOW`` and ``UNEXPECTED_CJK_IN_KOREAN_TEXT``)
    is passed to ``build_metadata``. The test then checks the serialized
    ``text_fidelity`` entry and the text-fidelity counters under ``summary``.
    """
    only_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))

    # One checked page that is flagged as a replacement candidate.
    fidelity_record = TextFidelityRecord(
        page_index=0,
        source_page_number=1,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=8,
        hangul_count_delta=-2,
        hangul_count_ratio=0.8,
        unexpected_cjk_count=1,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.72,
        replacement_candidate=True,
        comparison_status="checked",
    )

    low_fidelity_warning = WarningRecord(
        WarningCode.TEXT_FIDELITY_LOW,
        WarningSeverity.WARNING,
        "Low text fidelity.",
        page_index=0,
    )
    unexpected_cjk_warning = WarningRecord(
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningSeverity.WARNING,
        "Unexpected CJK.",
        page_index=0,
    )

    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(only_page,),
        text_fidelity=(fidelity_record,),
        warnings=(low_fidelity_warning, unexpected_cjk_warning),
    )

    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    # The per-page record must be present in the metadata payload.
    first_entry = metadata["text_fidelity"][0]
    assert first_entry["page_index"] == 0
    assert first_entry["replacement_candidate"] is True

    # Summary counters derived from the record and the two warnings.
    summary = metadata["summary"]
    expected_counts = {
        "text_fidelity_checked_page_count": 1,
        "text_fidelity_low_page_count": 1,
        "text_fidelity_unexpected_cjk_count": 1,
        "text_fidelity_replacement_candidate_page_count": 1,
        "text_fidelity_page_mapping_uncertain_count": 0,
    }
    assert {key: summary[key] for key in expected_counts} == expected_counts