108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from pypdf import PdfWriter
|
|
|
|
from pdf2md.ir import WarningCode
|
|
from pdf2md.text_fidelity import (
|
|
check_text_fidelity,
|
|
compare_text_pages,
|
|
count_hangul_syllables,
|
|
count_unexpected_cjk,
|
|
hangul_spacing_anomaly_ratio,
|
|
strip_markdown_for_text_fidelity,
|
|
)
|
|
|
|
|
|
def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None:
    """Exercise the Hangul counter, the CJK counter, and the spacing-anomaly ratio."""
    # One sample feeds both counters: two Hangul syllables, a Latin letter,
    # and a single Han character.
    mixed_sample = "응 력 A 曲"

    assert count_hangul_syllables(mixed_sample) == 2
    assert count_unexpected_cjk(mixed_sample) == 1

    # Space-separated syllables score 1.0; the joined word scores 0.0.
    assert hangul_spacing_anomaly_ratio("응 력") == 1.0
    assert hangul_spacing_anomaly_ratio("응력") == 0.0
|
|
|
|
|
|
def test_markdown_stripping_ignores_math_assets_and_code() -> None:
    """Check which fragments survive ``strip_markdown_for_text_fidelity``.

    Heading and body words must remain in the stripped text, while the
    inline-math identifier, the fenced code block content, and the inline
    code span must be absent.
    """
    document_lines = (
        "# 제목",
        "",
        "본문 $x_i$ 유지",
        "```",
        "코드 한글",
        "```",
        "`인라인 코드` 마지막",
    )
    stripped = strip_markdown_for_text_fidelity("\n".join(document_lines))

    for kept_fragment in ("제목", "본문", "마지막"):
        assert kept_fragment in stripped

    # NOTE(review): the input above contains no image/asset line, so the
    # "figure" check cannot fail for this input — consider adding one.
    for dropped_fragment in ("figure", "x_i", "코드 한글", "인라인 코드"):
        assert dropped_fragment not in stripped
|
|
|
|
|
|
def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None:
    """Compare one source page against Markdown that lost and altered Hangul.

    The Markdown page drops one syllable and swaps another for a Han
    character; the test pins the per-page fields and the exact ordered
    list of warning codes returned for that input.
    """
    outcome = compare_text_pages(
        source_pages=("쉘의 응력과 곡률을 계산한다",),
        markdown_pages=("쉘의 력과 曲률을 계산한다",),
        source_page_start=6,
    )
    first_page = outcome.pages[0]

    # Per-page fields.
    assert first_page.comparison_status == "checked"
    assert first_page.source_page_number == 6
    assert first_page.pypdf_hangul_count > first_page.markdown_hangul_count
    assert first_page.unexpected_cjk_count == 1
    assert first_page.replacement_candidate is True

    # Warning codes are compared as an ordered list, so order matters.
    emitted_codes = [warning.code for warning in outcome.warnings]
    expected_codes = [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
    ]
    assert emitted_codes == expected_codes
|
|
|
|
|
|
def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None:
    """Markdown with more Hangul than the source is not a replacement candidate."""
    outcome = compare_text_pages(
        source_pages=("응력",),
        markdown_pages=("응력 변형률",),
    )
    only_page = outcome.pages[0]

    # Five Markdown syllables against two source syllables.
    assert only_page.hangul_count_ratio == 2.5
    assert only_page.replacement_candidate is False
|
|
|
|
|
|
def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None:
    """A single Markdown string checked against two source pages.

    Both resulting pages must carry the ``page_mapping_uncertain`` status
    and the warnings must be exactly the two expected codes, in order.
    """
    outcome = check_text_fidelity(
        Path("paper.pdf"),
        "첫 페이지와 둘째 페이지를 합친 Markdown",
        page_count=2,
        source_text_pages=("첫 페이지", "둘째 페이지"),
    )

    page_statuses = [page.comparison_status for page in outcome.pages]
    assert page_statuses == ["page_mapping_uncertain", "page_mapping_uncertain"]

    emitted_codes = [warning.code for warning in outcome.warnings]
    assert emitted_codes == [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
    ]
|
|
|
|
|
|
def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None:
    """Run the fidelity check on a freshly written one-page blank PDF.

    The page must be reported as ``source_text_missing`` and the result
    must carry an empty warnings tuple.
    """
    # Build a 72x72 blank page and write it under pytest's temp directory.
    blank_pdf = tmp_path / "blank.pdf"
    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(width=72, height=72)
    with blank_pdf.open("wb") as handle:
        pdf_writer.write(handle)

    outcome = check_text_fidelity(blank_pdf, "Markdown text", page_count=1)

    assert outcome.pages[0].comparison_status == "source_text_missing"
    assert outcome.warnings == ()
|