Files
PDFToMD/tests/test_markdown.py
2026-05-08 16:42:19 +09:00

160 lines
5.2 KiB
Python

from __future__ import annotations
from pathlib import Path
import pytest
from pdf2md.ir import WarningCode
from pdf2md.markdown import normalize_markdown
def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    """Parenthesis-delimited inline math is emitted with single-dollar delimiters.

    The math body must be carried over unchanged and no warnings may be raised.
    """
    normalized = normalize_markdown(r"Area is \(x_i^2 + y^{2}\).")

    assert normalized.markdown == r"Area is $x_i^2 + y^{2}$."
    assert normalized.warnings == ()
def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text that already uses dollar signs (prices and inline math) comes back unchanged."""
    text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    normalized = normalize_markdown(text)

    assert normalized.markdown == text
def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    """Bracket display math becomes a ``$$`` block set off from the text by blank lines."""
    normalized = normalize_markdown("Before\n\\[\na_i^2 + b^2\n\\]\nAfter")
    expected = "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"

    assert normalized.markdown == expected
def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    """An ``align`` environment wrapped in bracket delimiters keeps its body inside ``$$`` lines."""
    text = "\\[\\begin{align}\na_i &= b^2\n\\end{align}\\]"
    expected = "$$\n\\begin{align}\na_i &= b^2\n\\end{align}\n$$"

    normalized = normalize_markdown(text)

    assert normalized.markdown == expected
def test_existing_display_math_spacing_is_idempotent() -> None:
    """Blank lines are added around an existing ``$$`` block once; a second pass changes nothing."""
    text = "Before\n$$\nx_i^2\n$$\nAfter"

    # Run both passes up front so the second pass is fed the first pass's output.
    first_pass = normalize_markdown(text).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass
def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the delimiters change; every special character in the math body survives."""
    text = r"\(\frac{x_i^{2}}{\alpha_beta}\)"
    expected = r"$\frac{x_i^{2}}{\alpha_beta}$"

    normalized = normalize_markdown(text)

    assert normalized.markdown == expected
def test_fenced_code_blocks_are_not_normalized() -> None:
    """Math and asset links inside a fenced block stay verbatim; math after the fence is converted."""
    text = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n![alt](assets\\x.png)\n```\n\\(z\\)"
    expected = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n![alt](assets\\x.png)\n```\n$z$"

    normalized = normalize_markdown(text)

    assert normalized.markdown == expected
def test_inline_code_spans_are_not_normalized() -> None:
    """Math inside a backtick code span is left alone while math outside it is converted."""
    text = r"Keep `\(x_i\)` and convert \(y_i\)."
    expected = r"Keep `\(x_i\)` and convert $y_i$."

    normalized = normalize_markdown(text)

    assert normalized.markdown == expected
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None:
    """A second pass over normalized mixed content yields the same markdown and warnings.

    The input mixes inline math, display math and a backslash-separated asset
    link whose target file exists under ``tmp_path / "assets"``.
    """
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()
    (asset_dir / "fig 1.png").write_bytes(b"image")

    def normalize_with_assets(text: str):
        # Both passes must run with identical asset-checking options.
        return normalize_markdown(
            text,
            markdown_dir=tmp_path,
            asset_root=asset_dir,
            check_assets=True,
        )

    first_pass = normalize_with_assets("Before \\(x_i\\)\n\\[y^2\\]\n![fig](assets\\fig 1.png)")
    second_pass = normalize_with_assets(first_pass.markdown)

    assert second_pass.markdown == first_pass.markdown
    assert second_pass.warnings == first_pass.warnings
def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None:
    """A backslash-separated relative link is rewritten with forward slashes.

    The non-ASCII alt text must be untouched, the rewritten target must be
    reported in ``asset_links``, and no warnings may be produced.
    """
    normalized = normalize_markdown(r"![한글 caption](assets\fig 1.png)")

    assert normalized.markdown == "![한글 caption](assets/fig 1.png)"
    assert normalized.asset_links == ("assets/fig 1.png",)
    assert normalized.warnings == ()
def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None:
    """With asset checking on, a link to a non-existent file is kept but flagged as missing."""
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()  # The directory exists, but the linked file inside it does not.

    normalized = normalize_markdown(
        "![missing](assets/missing.png)",
        markdown_dir=tmp_path,
        asset_root=asset_dir,
        check_assets=True,
    )

    assert normalized.markdown == "![missing](assets/missing.png)"
    warning_codes = [item.code for item in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_MISSING]
@pytest.mark.parametrize(
    ("source", "expected_link"),
    [
        (r"![absolute](C:\tmp\fig.png)", "fig.png"),
        ("![escape](../outside.png)", "outside.png"),
    ],
)
def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(
    source: str,
    expected_link: str,
) -> None:
    """Absolute and parent-escaping links end up pointing at ``expected_link`` with one warning.

    Args:
        source: Markdown image link whose target is an absolute Windows path
            or a path that climbs out of the document directory.
        expected_link: Link target the normalized markdown must end with.
    """
    normalized = normalize_markdown(source)

    assert normalized.markdown.endswith(f"({expected_link})")
    warning_codes = [item.code for item in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None:
    """An ``https`` image link is returned untouched, listed in ``asset_links`` and flagged invalid."""
    text = "![remote](https://example.test/fig.png)"

    normalized = normalize_markdown(text)

    assert normalized.markdown == text
    assert normalized.asset_links == ("https://example.test/fig.png",)
    warning_codes = [item.code for item in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(
    tmp_path: Path,
) -> None:
    """An absolute link to an existing file under the markdown directory becomes relative.

    The rewrite to ``assets/fig.png`` must still be accompanied by a single
    ``ASSET_LINK_INVALID`` warning.
    """
    image_dir = tmp_path / "assets"
    image_dir.mkdir()
    asset = image_dir / "fig.png"
    asset.write_bytes(b"image")

    normalized = normalize_markdown(
        f"![fig]({asset})",
        markdown_dir=tmp_path,
        asset_root=image_dir,
        check_assets=True,
    )

    assert normalized.markdown == "![fig](assets/fig.png)"
    warning_codes = [item.code for item in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
def test_simple_pipe_table_is_preserved() -> None:
    """A pipe table, including the parenthesis math in its cell, is returned verbatim without warnings."""
    table = "| A | B |\n|---|---|\n| \\(x\\) | y |"

    normalized = normalize_markdown(table)

    assert normalized.markdown == table
    assert normalized.warnings == ()
def test_complex_html_table_is_preserved_with_fallback_warning() -> None:
    """An HTML table with a ``rowspan`` cell is returned verbatim with one ``TABLE_FALLBACK`` warning."""
    table = '<table><tr><td rowspan="2">\\(x_i\\)</td><td>y</td></tr></table>'

    normalized = normalize_markdown(table)

    assert normalized.markdown == table
    warning_codes = [item.code for item in normalized.warnings]
    assert warning_codes == [WarningCode.TABLE_FALLBACK]