from __future__ import annotations

from pathlib import Path

import pytest

from pdf2md.ir import WarningCode
from pdf2md.markdown import normalize_markdown


def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    """Parenthesis-delimited inline math comes back in single dollars, with no warnings."""
    outcome = normalize_markdown(r"Area is \(x_i^2 + y^{2}\).")

    assert outcome.markdown == r"Area is $x_i^2 + y^{2}$."
    assert outcome.warnings == ()


def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text that already uses dollar signs (prices and math) is returned unchanged."""
    text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    assert normalize_markdown(text).markdown == text


def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    """Bracket-delimited display math becomes a ``$$`` block separated by blank lines."""
    raw = "Before\n\\[\na_i^2 + b^2\n\\]\nAfter"
    expected = "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"

    assert normalize_markdown(raw).markdown == expected


def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    """An ``align`` environment inside display brackets is kept verbatim between ``$$`` lines."""
    raw = "\\[\\begin{align}\na_i &= b^2\n\\end{align}\\]"
    expected = "$$\n\\begin{align}\na_i &= b^2\n\\end{align}\n$$"

    assert normalize_markdown(raw).markdown == expected


def test_existing_display_math_spacing_is_idempotent() -> None:
    """A ``$$`` block gains surrounding blank lines once; a second pass changes nothing."""
    raw = "Before\n$$\nx_i^2\n$$\nAfter"

    first_pass = normalize_markdown(raw).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass


def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the delimiters change; the math body is passed through character for character."""
    raw = r"\(\frac{x_i^{2}}{\alpha_beta}\)"
    expected = r"$\frac{x_i^{2}}{\alpha_beta}$"

    assert normalize_markdown(raw).markdown == expected


def test_fenced_code_blocks_are_not_normalized() -> None:
    """Math delimiters inside a fenced code block stay as written; those after it are converted."""
    raw = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n\\(z\\)"
    expected = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n$z$"

    assert normalize_markdown(raw).markdown == expected


def test_inline_code_spans_are_not_normalized() -> None:
    """Math delimiters inside a backtick code span stay as written; those outside are converted."""
    raw = r"Keep `\(x_i\)` and convert \(y_i\)."
    expected = r"Keep `\(x_i\)` and convert $y_i$."

    assert normalize_markdown(raw).markdown == expected
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None: (tmp_path / "assets").mkdir() (tmp_path / "assets" / "fig 1.png").write_bytes(b"image") source = "Before \\(x_i\\)\n\\[y^2\\]\n" once = normalize_markdown(source, markdown_dir=tmp_path, asset_root=tmp_path / "assets", check_assets=True) twice = normalize_markdown(once.markdown, markdown_dir=tmp_path, asset_root=tmp_path / "assets", check_assets=True) assert twice.markdown == once.markdown assert twice.warnings == once.warnings def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None: result = normalize_markdown(r"") assert result.markdown == "" assert result.asset_links == ("assets/fig 1.png",) assert result.warnings == () def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None: (tmp_path / "assets").mkdir() result = normalize_markdown( "", markdown_dir=tmp_path, asset_root=tmp_path / "assets", check_assets=True, ) assert result.markdown == "" assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_MISSING] @pytest.mark.parametrize( ("source", "expected_link"), [ (r"", "fig.png"), ("", "outside.png"), ], ) def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(source: str, expected_link: str) -> None: result = normalize_markdown(source) assert result.markdown.endswith(f"({expected_link})") assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID] def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None: source = "" result = normalize_markdown(source) assert result.markdown == source assert result.asset_links == ("https://example.test/fig.png",) assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID] def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(tmp_path: Path) -> None: asset_dir = tmp_path / "assets" asset_dir.mkdir() asset = asset_dir / "fig.png" 
asset.write_bytes(b"image") result = normalize_markdown(f"", markdown_dir=tmp_path, asset_root=asset_dir, check_assets=True) assert result.markdown == "" assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID] def test_simple_pipe_table_is_preserved() -> None: source = "| A | B |\n|---|---|\n| \\(x\\) | y |" result = normalize_markdown(source) assert result.markdown == source assert result.warnings == () def test_complex_html_table_is_preserved_with_fallback_warning() -> None: source = '
| \\(x_i\\) | y |