from __future__ import annotations from pathlib import Path from pypdf import PdfWriter from pdf2md.ir import WarningCode from pdf2md.text_fidelity import ( check_text_fidelity, compare_text_pages, count_hangul_syllables, count_unexpected_cjk, hangul_spacing_anomaly_ratio, strip_markdown_for_text_fidelity, ) def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None: assert count_hangul_syllables("응 력 A 曲") == 2 assert count_unexpected_cjk("응 력 A 曲") == 1 assert hangul_spacing_anomaly_ratio("응 력") == 1.0 assert hangul_spacing_anomaly_ratio("응력") == 0.0 def test_markdown_stripping_ignores_math_assets_and_code() -> None: markdown = "\n".join( [ "# 제목", "![figure](paper.assets/fig.png)", "본문 $x_i$ 유지", "```", "코드 한글", "```", "`인라인 코드` 마지막", ] ) stripped = strip_markdown_for_text_fidelity(markdown) assert "제목" in stripped assert "본문" in stripped assert "마지막" in stripped assert "figure" not in stripped assert "x_i" not in stripped assert "코드 한글" not in stripped assert "인라인 코드" not in stripped def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None: result = compare_text_pages( source_pages=("쉘의 응력과 곡률을 계산한다",), markdown_pages=("쉘의 력과 曲률을 계산한다",), source_page_start=6, ) page = result.pages[0] assert page.comparison_status == "checked" assert page.source_page_number == 6 assert page.pypdf_hangul_count > page.markdown_hangul_count assert page.unexpected_cjk_count == 1 assert page.replacement_candidate is True assert [warning.code for warning in result.warnings] == [ WarningCode.TEXT_LAYER_AVAILABLE, WarningCode.TEXT_FIDELITY_LOW, WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT, ] def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None: result = compare_text_pages( source_pages=("응력",), markdown_pages=("응력 변형률",), ) assert result.pages[0].hangul_count_ratio == 2.5 assert result.pages[0].replacement_candidate is False def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None: result = check_text_fidelity( Path("paper.pdf"), "첫 페이지와 둘째 페이지를 합친 Markdown", page_count=2, source_text_pages=("첫 페이지", "둘째 페이지"), ) assert [page.comparison_status for page in result.pages] == [ "page_mapping_uncertain", "page_mapping_uncertain", ] assert [warning.code for warning in result.warnings] == [ WarningCode.TEXT_LAYER_AVAILABLE, WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN, ] def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None: pdf = tmp_path / "blank.pdf" writer = PdfWriter() writer.add_blank_page(width=72, height=72) with pdf.open("wb") as file: writer.write(file) result = check_text_fidelity(pdf, "Markdown text", page_count=1) assert result.pages[0].comparison_status == "source_text_missing" assert result.warnings == ()