add pdftomd
This commit is contained in:
@@ -0,0 +1,159 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.markdown import normalize_markdown
|
||||
|
||||
|
||||
def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    """Inline backslash-parenthesis math must come back wrapped in single dollars, with no warnings."""
    source = r"Area is \(x_i^2 + y^{2}\)."
    expected = r"Area is $x_i^2 + y^{2}$."

    normalized = normalize_markdown(source)

    assert normalized.markdown == expected
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text mixing currency amounts and dollar-delimited math must pass through unchanged."""
    text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    assert normalize_markdown(text).markdown == text
|
||||
|
||||
|
||||
def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    """A bracket-delimited display block between text lines must become a ``$$`` block set off by blank lines."""
    source = "Before\n\\[\na_i^2 + b^2\n\\]\nAfter"
    expected = "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"

    normalized = normalize_markdown(source)

    assert normalized.markdown == expected
|
||||
|
||||
|
||||
def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    """An ``align`` environment inside bracket delimiters must be kept verbatim between ``$$`` lines."""
    bracketed = "\\[\\begin{align}\na_i &= b^2\n\\end{align}\\]"
    expected = "$$\n\\begin{align}\na_i &= b^2\n\\end{align}\n$$"

    assert normalize_markdown(bracketed).markdown == expected
|
||||
|
||||
|
||||
def test_existing_display_math_spacing_is_idempotent() -> None:
    """Normalizing an existing ``$$`` block adds blank lines once; a second pass changes nothing."""
    original = "Before\n$$\nx_i^2\n$$\nAfter"

    # Run both passes before asserting, so the second pass always sees the first pass's output.
    first_pass = normalize_markdown(original).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass
|
||||
|
||||
|
||||
def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the delimiters of inline math may change; the LaTeX body must survive character for character."""
    latex = r"\(\frac{x_i^{2}}{\alpha_beta}\)"
    expected = r"$\frac{x_i^{2}}{\alpha_beta}$"

    normalized = normalize_markdown(latex)

    assert normalized.markdown == expected
|
||||
|
||||
|
||||
def test_fenced_code_blocks_are_not_normalized() -> None:
    """Math delimiters inside a fenced block must stay untouched while math after the fence is converted."""
    document = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n\\(z\\)"
    expected = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n$z$"

    assert normalize_markdown(document).markdown == expected
|
||||
|
||||
|
||||
def test_inline_code_spans_are_not_normalized() -> None:
    """Math delimiters inside a backtick span must stay literal; the same delimiters outside it are converted."""
    text = r"Keep `\(x_i\)` and convert \(y_i\)."
    expected = r"Keep `\(x_i\)` and convert $y_i$."

    normalized = normalize_markdown(text)

    assert normalized.markdown == expected
|
||||
|
||||
|
||||
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None:
    """A second normalization pass with asset checking on must reproduce the first pass's markdown and warnings."""
    assets = tmp_path / "assets"
    assets.mkdir()
    (assets / "fig 1.png").write_bytes(b"image")
    # NOTE(review): the asset file created above is never referenced by the
    # source text below; confirm whether an image link was lost from this literal.
    text = "Before \\(x_i\\)\n\\[y^2\\]\n"
    options = {"markdown_dir": tmp_path, "asset_root": assets, "check_assets": True}

    first_pass = normalize_markdown(text, **options)
    second_pass = normalize_markdown(first_pass.markdown, **options)

    assert second_pass.markdown == first_pass.markdown
    assert second_pass.warnings == first_pass.warnings
|
||||
|
||||
|
||||
def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None:
    """Check the normalized markdown, the collected asset links and the absence of warnings.

    NOTE(review): both the input and the expected markdown are empty strings
    here, yet an asset link is expected in ``asset_links``. Presumably the
    image-link markup was lost from these literals; verify against the
    upstream file before relying on this test.
    """
    normalized = normalize_markdown(r"")

    assert normalized.markdown == ""
    assert normalized.asset_links == ("assets/fig 1.png",)
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None:
    """With asset checking on and an empty asset directory, exactly one ASSET_LINK_MISSING warning is expected.

    NOTE(review): the input and expected markdown are empty strings here;
    presumably an image link to a non-existent asset was lost from these
    literals. Verify against the upstream file.
    """
    asset_root = tmp_path / "assets"
    asset_root.mkdir()

    normalized = normalize_markdown("", markdown_dir=tmp_path, asset_root=asset_root, check_assets=True)

    assert normalized.markdown == ""
    codes = [item.code for item in normalized.warnings]
    assert codes == [WarningCode.ASSET_LINK_MISSING]
|
||||
|
||||
|
||||
# NOTE(review): both parametrized sources are empty strings here; presumably the
# image-link markup was lost from these literals. Verify against the upstream file.
@pytest.mark.parametrize(
    ("source", "expected_link"),
    [
        (r"", "fig.png"),
        ("", "outside.png"),
    ],
)
def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(source: str, expected_link: str) -> None:
    """The normalized markdown must end with the expected link in parentheses and carry one ASSET_LINK_INVALID warning."""
    normalized = normalize_markdown(source)
    link_suffix = f"({expected_link})"

    assert normalized.markdown.endswith(link_suffix)
    codes = [item.code for item in normalized.warnings]
    assert codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None:
    """The markdown must come back unchanged, the remote URL must be recorded, and one ASSET_LINK_INVALID warning raised.

    NOTE(review): the source literal is an empty string here although a remote
    URL is expected in ``asset_links``; presumably the image-link markup was
    lost from it. Verify against the upstream file.
    """
    text = ""

    normalized = normalize_markdown(text)

    assert normalized.markdown == text
    assert normalized.asset_links == ("https://example.test/fig.png",)
    codes = [item.code for item in normalized.warnings]
    assert codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(tmp_path: Path) -> None:
    """Create a real asset under the markdown directory, normalize with checking on, and expect one ASSET_LINK_INVALID warning.

    NOTE(review): the input is a placeholder-free f-string and the expected
    markdown is empty; presumably an image link interpolating the absolute
    asset path was lost from these literals. Verify against the upstream file.
    """
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()
    asset = asset_dir / "fig.png"
    asset.write_bytes(b"image")

    normalized = normalize_markdown(
        f"",
        markdown_dir=tmp_path,
        asset_root=asset_dir,
        check_assets=True,
    )

    assert normalized.markdown == ""
    codes = [item.code for item in normalized.warnings]
    assert codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_simple_pipe_table_is_preserved() -> None:
    """A plain pipe table, including the math-delimited cell, must round-trip unchanged and without warnings."""
    table = "| A | B |\n|---|---|\n| \\(x\\) | y |"

    normalized = normalize_markdown(table)

    assert normalized.markdown == table
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_complex_html_table_is_preserved_with_fallback_warning() -> None:
    """An HTML table with a ``rowspan`` cell must be returned verbatim along with one TABLE_FALLBACK warning."""
    html_table = '<table><tr><td rowspan="2">\\(x_i\\)</td><td>y</td></tr></table>'

    normalized = normalize_markdown(html_table)

    assert normalized.markdown == html_table
    codes = [item.code for item in normalized.warnings]
    assert codes == [WarningCode.TABLE_FALLBACK]
|
||||
Reference in New Issue
Block a user