modify pdftomd
This commit is contained in:
@@ -11,6 +11,7 @@ from pdf2md.ir import (
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
TextFidelityRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
@@ -171,3 +172,53 @@ def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path)
|
||||
|
||||
assert summary["warning_count"] == 1
|
||||
assert summary["math_render_error_count"] == 0
|
||||
|
||||
|
||||
def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None:
    """Verify that text-fidelity records and their summary counters reach the metadata.

    A one-page document is assembled with a single text-fidelity record
    (flagged as a replacement candidate, with one unexpected CJK character)
    and two page-level warnings. The metadata produced by ``build_metadata``
    must expose the record under ``"text_fidelity"`` and report the matching
    counters under ``"summary"``.
    """
    only_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))

    fidelity_record = TextFidelityRecord(
        page_index=0,
        source_page_number=1,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=8,
        hangul_count_delta=-2,
        hangul_count_ratio=0.8,
        unexpected_cjk_count=1,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.72,
        replacement_candidate=True,
        comparison_status="checked",
    )

    low_fidelity_warning = WarningRecord(
        WarningCode.TEXT_FIDELITY_LOW,
        WarningSeverity.WARNING,
        "Low text fidelity.",
        page_index=0,
    )
    unexpected_cjk_warning = WarningRecord(
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningSeverity.WARNING,
        "Unexpected CJK.",
        page_index=0,
    )

    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(only_page,),
        text_fidelity=(fidelity_record,),
        warnings=(low_fidelity_warning, unexpected_cjk_warning),
    )

    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    # The per-page record itself must be carried through.
    first_fidelity_entry = metadata["text_fidelity"][0]
    assert first_fidelity_entry["page_index"] == 0
    assert first_fidelity_entry["replacement_candidate"] is True

    # The summary counters must reflect the single record and the warnings above.
    summary = metadata["summary"]
    assert summary["text_fidelity_checked_page_count"] == 1
    assert summary["text_fidelity_low_page_count"] == 1
    assert summary["text_fidelity_unexpected_cjk_count"] == 1
    assert summary["text_fidelity_replacement_candidate_page_count"] == 1
    assert summary["text_fidelity_page_mapping_uncertain_count"] == 0
|
||||
|
||||
Reference in New Issue
Block a user