167 lines
4.8 KiB
Python
167 lines
4.8 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from pdftomd.models import Asset, AssetKind
|
|
from pdftomd.quality import (
|
|
validate_caption_reference_anchors,
|
|
validate_chunk_frontmatter,
|
|
validate_image_links,
|
|
validate_latex_environments,
|
|
validate_markdown_quality,
|
|
validate_math_delimiters,
|
|
validate_tables,
|
|
)
|
|
|
|
|
|
def messages(issues: list[object]) -> list[str]:
    """Return the ``message`` attribute of each issue, in input order.

    Raises ``AttributeError`` if any element lacks a ``message`` attribute.
    """
    collected: list[str] = []
    for entry in issues:
        collected.append(getattr(entry, "message"))
    return collected
|
|
|
|
|
|
def test_math_delimiters_accept_inline_and_block_math() -> None:
    """Balanced inline ``$...$`` math and a ``$$`` fenced block yield no issues."""
    lines = [
        "Inline energy $E = mc^2$ is preserved.",
        "",
        "$$",
        r"\int_0^1 x^2 dx",
        "$$",
    ]
    document = "\n".join(lines)

    found = validate_math_delimiters(document)

    assert found == []
|
|
|
|
|
|
def test_math_delimiters_report_actionable_unclosed_inline_math() -> None:
    """A single unmatched ``$`` yields exactly one issue, located on line 1.

    The issue message must name the problem and include the ``$`` delimiter.
    """
    text = "The expression $E = mc^2 is missing a close."

    found = validate_math_delimiters(text)

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed inline math delimiter" in issue.message
    assert issue.line == 1
    assert "$" in issue.message
|
|
|
|
|
|
def test_math_delimiters_report_actionable_unclosed_block_math() -> None:
    """An opening ``$$`` with no closing fence yields one issue at line 2."""
    # The opening fence is the second line of the document.
    text = "Before\n$$\na^2 + b^2 = c^2\nAfter"

    found = validate_math_delimiters(text)

    assert len(found) == 1
    issue = found[0]
    assert "Unclosed block math delimiter" in issue.message
    assert issue.line == 2
|
|
|
|
|
|
def test_latex_environment_pairs_accept_nested_matching_pairs() -> None:
    """A ``matrix`` environment nested inside ``aligned`` yields no issues."""
    # Raw string so the LaTeX backslashes reach the validator untouched.
    document = r"""
$$
\begin{aligned}
a &= b \\
\begin{matrix}1 & 2\end{matrix}
\end{aligned}
$$
"""

    found = validate_latex_environments(document)

    assert found == []
|
|
|
|
|
|
def test_latex_environment_pairs_report_mismatch() -> None:
    """``\\begin{aligned}`` closed by ``\\end{matrix}`` yields one issue.

    The issue message must name both the opened and the closing environment.
    """
    source = r"\begin{aligned} x \end{matrix}"

    found = validate_latex_environments(source)

    assert len(found) == 1
    text = found[0].message
    assert "LaTeX environment mismatch" in text
    assert "aligned" in text
    assert "matrix" in text
|
|
|
|
|
|
def test_image_links_validate_filesystem_and_modeled_assets(tmp_path: Path) -> None:
    """Only an image link that resolves nowhere is reported.

    The markdown links to ``images/paper_fig-1.png``, which is written under
    ``tmp_path`` and also modeled as an ``Asset``, and to ``images/missing.png``,
    which is neither on disk nor among the assets. Exactly one issue, naming
    the missing target, is expected.
    """
    image_dir = tmp_path / "images"
    image_dir.mkdir()
    (image_dir / "paper_fig-1.png").write_bytes(b"png")
    asset = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path="images/paper_fig-1.png",
        page=1,
    )
    # The fixture must actually contain image links: a bare "\n" has none, so
    # the validator would have nothing to report and the assertion below could
    # never hold.
    # NOTE(review): alt texts are reconstructed; only the link targets are
    # implied by the asset path and the expected message -- confirm upstream.
    markdown = "![Figure 1](images/paper_fig-1.png)\n![Missing](images/missing.png)"

    issues = validate_image_links(markdown, base_dir=tmp_path, assets=[asset])

    assert messages(issues) == [
        "Image link target does not exist on disk and is not present in modeled assets: images/missing.png"
    ]
|
|
|
|
|
|
def test_simple_markdown_table_parseability() -> None:
    """A two-column pipe table with uniform row widths yields no issues."""
    rows = [
        "| A | B |",
        "| --- | --- |",
        "| 1 | 2 |",
        "| 3 | 4 |",
    ]
    table = "\n".join(rows)

    found = validate_tables(table)

    assert found == []
|
|
|
|
|
|
def test_markdown_table_reports_row_width_mismatch() -> None:
    """A three-cell body row under a two-column header yields one issue."""
    table = "| A | B |\n| --- | --- |\n| 1 | 2 | 3 |"

    found = validate_tables(table)

    assert len(found) == 1
    text = found[0].message
    assert "Markdown table row has 3 cells; expected 2" in text
|
|
|
|
|
|
def test_complex_table_can_be_represented_as_allowed_html_with_fallback() -> None:
    """An HTML ``<table>`` with ``rowspan`` passes when the HTML fallback is on."""
    html_lines = [
        '<table id="tbl-1">',
        "<tr><th rowspan=\"2\">Load</th><th>Value</th></tr>",
        "<tr><td>42</td></tr>",
        "</table>",
        "",
    ]
    document = "\n".join(html_lines)

    found = validate_tables(document, allow_html_table_fallback=True)

    assert found == []
|
|
|
|
|
|
def test_frontmatter_requires_chunk_context_fields() -> None:
    """Frontmatter lacking ``title`` and ``page_range`` reports both, in order."""
    document = "---\ndocument_slug: paper\nchunk_index: 1\n---\n# Paper"
    expected = [
        "Chunk frontmatter is missing required field: title",
        "Chunk frontmatter is missing required field: page_range",
    ]

    found = validate_chunk_frontmatter(document)

    assert messages(found) == expected
|
|
|
|
|
|
def test_frontmatter_accepts_required_chunk_context_fields() -> None:
    """Frontmatter with title, slug, chunk index and page range yields no issues."""
    document = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1-3\n---\n# Paper"

    found = validate_chunk_frontmatter(document)

    assert found == []
|
|
|
|
|
|
def test_caption_reference_anchor_shape_checks_known_reference_targets() -> None:
    """A link to a defined anchor passes; a link to an undefined one is reported.

    The document defines ``fig-1`` but links to both ``#fig-1`` and
    ``#table-2``; only the latter is expected in the issues.
    """
    lines = [
        '<a id="fig-1"></a>',
        "",
        "Figure 1. Diagram.",
        "As shown in [Fig. 1](#fig-1) and [Table 2](#table-2).",
    ]
    document = "\n".join(lines)
    expected = ["Reference link points to a missing anchor: #table-2"]

    found = validate_caption_reference_anchors(document)

    assert messages(found) == expected
|
|
|
|
|
|
def test_combined_quality_gate_does_not_mutate_markdown() -> None:
    """The combined gate returns the input text unchanged for a clean chunk.

    The result must also report ``ok`` and carry an empty ``issues`` tuple.
    """
    original = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1\n---\n$E=mc^2$"

    outcome = validate_markdown_quality(original)

    assert outcome.markdown == original
    assert outcome.ok
    assert outcome.issues == ()
|