modify pdftomd
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pypdf import PdfWriter
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.text_fidelity import (
|
||||
check_text_fidelity,
|
||||
compare_text_pages,
|
||||
count_hangul_syllables,
|
||||
count_unexpected_cjk,
|
||||
hangul_spacing_anomaly_ratio,
|
||||
strip_markdown_for_text_fidelity,
|
||||
)
|
||||
|
||||
|
||||
def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None:
    """Check the counting helpers and the spacing ratio on short fixed strings."""
    # Two Hangul syllables, one Latin letter and one CJK ideograph.
    mixed_sample = "응 력 A 曲"

    assert count_hangul_syllables(mixed_sample) == 2
    assert count_unexpected_cjk(mixed_sample) == 1

    # The same two syllables, with and without a space between them.
    assert hangul_spacing_anomaly_ratio("응 력") == 1.0
    assert hangul_spacing_anomaly_ratio("응력") == 0.0
|
||||
|
||||
|
||||
def test_markdown_stripping_ignores_math_assets_and_code() -> None:
    """Check which fixture fragments survive ``strip_markdown_for_text_fidelity``.

    Heading and body words must remain; the inline math symbol, the fenced
    code content and the inline code content must be absent from the result.
    """
    fixture_lines = (
        "# 제목",
        "",
        "본문 $x_i$ 유지",
        "```",
        "코드 한글",
        "```",
        "`인라인 코드` 마지막",
    )
    stripped = strip_markdown_for_text_fidelity("\n".join(fixture_lines))

    for kept in ("제목", "본문", "마지막"):
        assert kept in stripped

    # NOTE(review): the fixture contains no "figure" text, so that check cannot
    # fail as written — confirm whether an image/asset line was meant to be
    # part of the input.
    for dropped in ("figure", "x_i", "코드 한글", "인라인 코드"):
        assert dropped not in stripped
|
||||
|
||||
|
||||
def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None:
    """Compare one source page against a Markdown page with a dropped syllable
    and a CJK ideograph, then check the page fields and the warning codes."""
    source_text = "쉘의 응력과 곡률을 계산한다"
    degraded_text = "쉘의 력과 曲률을 계산한다"
    expected_codes = [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
    ]

    outcome = compare_text_pages(
        source_pages=(source_text,),
        markdown_pages=(degraded_text,),
        source_page_start=6,
    )
    first_page = outcome.pages[0]

    assert first_page.comparison_status == "checked"
    assert first_page.source_page_number == 6
    assert first_page.pypdf_hangul_count > first_page.markdown_hangul_count
    assert first_page.unexpected_cjk_count == 1
    assert first_page.replacement_candidate is True

    reported_codes = [item.code for item in outcome.warnings]
    assert reported_codes == expected_codes
|
||||
|
||||
|
||||
def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None:
    """A Markdown page with more Hangul than the source page must report a
    ratio of 2.5 and must not be marked as a replacement candidate."""
    outcome = compare_text_pages(
        source_pages=("응력",),
        markdown_pages=("응력 변형률",),
    )
    only_page = outcome.pages[0]

    assert only_page.hangul_count_ratio == 2.5
    assert only_page.replacement_candidate is False
|
||||
|
||||
|
||||
def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None:
    """One Markdown string checked against two source text pages must mark
    both pages ``page_mapping_uncertain`` and emit the two expected warnings."""
    source_texts = ("첫 페이지", "둘째 페이지")
    merged_markdown = "첫 페이지와 둘째 페이지를 합친 Markdown"

    outcome = check_text_fidelity(
        Path("paper.pdf"),
        merged_markdown,
        page_count=2,
        source_text_pages=source_texts,
    )

    statuses = [entry.comparison_status for entry in outcome.pages]
    assert statuses == ["page_mapping_uncertain", "page_mapping_uncertain"]

    reported_codes = [item.code for item in outcome.warnings]
    assert reported_codes == [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
    ]
|
||||
|
||||
|
||||
def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None:
    """Write a one-page blank PDF to ``tmp_path`` and check that the fidelity
    check reports ``source_text_missing`` for that page with no warnings."""
    blank_pdf = tmp_path / "blank.pdf"

    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(width=72, height=72)
    with blank_pdf.open("wb") as handle:
        pdf_writer.write(handle)

    outcome = check_text_fidelity(blank_pdf, "Markdown text", page_count=1)
    only_page = outcome.pages[0]

    assert only_page.comparison_status == "source_text_missing"
    assert outcome.warnings == ()
|
||||
Reference in New Issue
Block a user