modify pdftomd

This commit is contained in:
김경종
2026-05-14 10:16:59 +09:00
parent 2232b51fc9
commit dc11880140
69 changed files with 7784 additions and 1150 deletions
+107
View File
@@ -0,0 +1,107 @@
from __future__ import annotations
from pathlib import Path
from pypdf import PdfWriter
from pdf2md.ir import WarningCode
from pdf2md.text_fidelity import (
check_text_fidelity,
compare_text_pages,
count_hangul_syllables,
count_unexpected_cjk,
hangul_spacing_anomaly_ratio,
strip_markdown_for_text_fidelity,
)
def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None:
    """Check the character-level metric helpers on small fixed strings.

    The mixed sample contains two Hangul syllables, one Latin letter and one
    CJK ideograph, each separated by a single space.
    """
    mixed_sample = "응 력 A 曲"

    assert count_hangul_syllables(mixed_sample) == 2
    assert count_unexpected_cjk(mixed_sample) == 1

    # The same word is scored twice: once with its syllables split by a
    # space (expected ratio 1.0) and once written contiguously (expected 0.0).
    assert hangul_spacing_anomaly_ratio("응 력") == 1.0
    assert hangul_spacing_anomaly_ratio("응력") == 0.0
def test_markdown_stripping_ignores_math_assets_and_code() -> None:
    """Verify which parts of a Markdown fixture survive stripping.

    The fixture mixes a heading, an image reference, inline math, a fenced
    code block and an inline code span. The heading and body words must
    remain in the stripped text; the image alt text, the math content and
    both kinds of code must be absent.
    """
    fixture_lines = (
        "# 제목",
        "![figure](paper.assets/fig.png)",
        "본문 $x_i$ 유지",
        "```",
        "코드 한글",
        "```",
        "`인라인 코드` 마지막",
    )
    stripped = strip_markdown_for_text_fidelity("\n".join(fixture_lines))

    # Prose fragments that must be preserved.
    for kept in ("제목", "본문", "마지막"):
        assert kept in stripped

    # Non-prose fragments that must have been removed.
    for dropped in ("figure", "x_i", "코드 한글", "인라인 코드"):
        assert dropped not in stripped
def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None:
    """Compare a source page against a degraded Markdown rendering of it.

    The Markdown page drops one Hangul syllable and replaces another with a
    CJK ideograph. The test expects the page to be checked, numbered from the
    supplied ``source_page_start``, marked as a replacement candidate, and
    reported with three warnings in a fixed order.
    """
    comparison = compare_text_pages(
        source_pages=("쉘의 응력과 곡률을 계산한다",),
        markdown_pages=("쉘의 력과 曲률을 계산한다",),
        source_page_start=6,
    )
    first_page = comparison.pages[0]

    assert first_page.comparison_status == "checked"
    assert first_page.source_page_number == 6
    assert first_page.pypdf_hangul_count > first_page.markdown_hangul_count
    assert first_page.unexpected_cjk_count == 1
    assert first_page.replacement_candidate is True

    expected_codes = [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
    ]
    emitted_codes = [item.code for item in comparison.warnings]
    assert emitted_codes == expected_codes
def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None:
    """Check the case where Markdown has more Hangul than the source page.

    The source page has two Hangul syllables and the Markdown page has five,
    so the expected ratio is 2.5 and the page must not be flagged as a
    replacement candidate.
    """
    comparison = compare_text_pages(
        source_pages=("응력",),
        markdown_pages=("응력 변형률",),
    )
    only_page = comparison.pages[0]

    assert only_page.hangul_count_ratio == 2.5
    assert only_page.replacement_candidate is False
def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None:
    """Check a two-page source against one undivided Markdown string.

    Source text is supplied directly through ``source_text_pages``. Both
    pages are expected to carry the "page_mapping_uncertain" status, and
    exactly two warnings are expected in a fixed order.
    """
    outcome = check_text_fidelity(
        Path("paper.pdf"),
        "첫 페이지와 둘째 페이지를 합친 Markdown",
        page_count=2,
        source_text_pages=("첫 페이지", "둘째 페이지"),
    )

    statuses = [entry.comparison_status for entry in outcome.pages]
    assert statuses == ["page_mapping_uncertain"] * 2

    emitted_codes = [item.code for item in outcome.warnings]
    assert emitted_codes == [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
    ]
def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None:
    """Run the fidelity check against a freshly written blank PDF.

    A single 72x72 blank page is written to ``tmp_path``. The check is
    expected to report the page as "source_text_missing" and to produce an
    empty warnings tuple.
    """
    blank_pdf_path = tmp_path / "blank.pdf"

    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(width=72, height=72)
    with blank_pdf_path.open("wb") as stream:
        pdf_writer.write(stream)

    # The file is closed before it is handed to the checker.
    outcome = check_text_fidelity(blank_pdf_path, "Markdown text", page_count=1)

    assert outcome.pages[0].comparison_status == "source_text_missing"
    assert outcome.warnings == ()