Files
PDFToMD/tests/test_quality.py
T
김경종 7e985ae94a add files
2026-04-30 17:05:19 +09:00

167 lines
4.8 KiB
Python

from __future__ import annotations

from collections.abc import Iterable
from pathlib import Path

from pdftomd.models import Asset, AssetKind
from pdftomd.quality import (
    validate_caption_reference_anchors,
    validate_chunk_frontmatter,
    validate_image_links,
    validate_latex_environments,
    validate_markdown_quality,
    validate_math_delimiters,
    validate_tables,
)
def messages(issues: Iterable[object]) -> list[str]:
    """Return the ``message`` attribute of each issue, in iteration order.

    Args:
        issues: Any iterable of issue objects (list, tuple, generator, ...).
            Every item must expose a ``message`` attribute.

    Returns:
        A new list holding one ``message`` value per item of ``issues``.

    Raises:
        AttributeError: If an item has no ``message`` attribute.
    """
    # Items are only typed as ``object``; getattr is presumably used so the
    # attribute lookup does not trip static type checkers.
    return [getattr(issue, "message") for issue in issues]
def test_math_delimiters_accept_inline_and_block_math() -> None:
    """Balanced inline ``$`` math and a closed ``$$`` block yield no issues."""
    lines = [
        "Inline energy $E = mc^2$ is preserved.",
        "",
        "$$",
        r"\int_0^1 x^2 dx",
        "$$",
    ]
    document = "\n".join(lines)
    issues = validate_math_delimiters(document)
    assert issues == []
def test_math_delimiters_report_actionable_unclosed_inline_math() -> None:
    """A lone inline ``$`` is reported once, on line 1, naming the delimiter."""
    source = "The expression $E = mc^2 is missing a close."
    issues = validate_math_delimiters(source)
    assert len(issues) == 1
    only_issue = issues[0]
    assert "Unclosed inline math delimiter" in only_issue.message
    assert only_issue.line == 1
    assert "$" in only_issue.message
def test_math_delimiters_report_actionable_unclosed_block_math() -> None:
    """An opening ``$$`` with no closing partner is reported at its own line."""
    source = "Before\n$$\na^2 + b^2 = c^2\nAfter"
    issues = validate_math_delimiters(source)
    assert len(issues) == 1
    only_issue = issues[0]
    assert "Unclosed block math delimiter" in only_issue.message
    # The opening ``$$`` sits on the second line of the input.
    assert only_issue.line == 2
def test_latex_environment_pairs_accept_nested_matching_pairs() -> None:
    """Correctly nested begin/end environment pairs produce no issues."""
    # The literal is kept flush-left so its content stays exactly as written.
    nested_environments = r"""
$$
\begin{aligned}
a &= b \\
\begin{matrix}1 & 2\end{matrix}
\end{aligned}
$$
"""
    issues = validate_latex_environments(nested_environments)
    assert issues == []
def test_latex_environment_pairs_report_mismatch() -> None:
    """A begin closed by a differently named end is reported with both names."""
    mismatched = r"\begin{aligned} x \end{matrix}"
    issues = validate_latex_environments(mismatched)
    assert len(issues) == 1
    report = issues[0].message
    assert "LaTeX environment mismatch" in report
    # Both the opened and the closing environment names must be mentioned.
    assert "aligned" in report
    assert "matrix" in report
def test_image_links_validate_filesystem_and_modeled_assets(tmp_path: Path) -> None:
    """An existing, modeled image passes; a link to a missing file is reported.

    Args:
        tmp_path: Temporary directory used as the base for resolving links.
    """
    # Arrange: one real file on disk that is also described by an Asset.
    images = tmp_path / "images"
    images.mkdir()
    figure_file = images / "paper_fig-1.png"
    figure_file.write_bytes(b"png")
    modeled_figure = Asset(
        id="asset-001",
        kind=AssetKind.FIGURE,
        relative_path="images/paper_fig-1.png",
        page=1,
    )
    document = "![Figure 1](images/paper_fig-1.png)\n![Figure 2](images/missing.png)"

    # Act
    issues = validate_image_links(document, base_dir=tmp_path, assets=[modeled_figure])

    # Assert: only the second link, which has no file and no asset, is flagged.
    expected = [
        "Image link target does not exist on disk and is not present in modeled assets: images/missing.png"
    ]
    assert messages(issues) == expected
def test_simple_markdown_table_parseability() -> None:
    """A pipe table whose rows all match the header width yields no issues."""
    rows = [
        "| A | B |",
        "| --- | --- |",
        "| 1 | 2 |",
        "| 3 | 4 |",
    ]
    table = "\n".join(rows)
    issues = validate_tables(table)
    assert issues == []
def test_markdown_table_reports_row_width_mismatch() -> None:
    """A three-cell row under a two-column header is reported exactly once."""
    table = "| A | B |\n| --- | --- |\n| 1 | 2 | 3 |"
    issues = validate_tables(table)
    assert len(issues) == 1
    report = issues[0].message
    assert "Markdown table row has 3 cells; expected 2" in report
def test_complex_table_can_be_represented_as_allowed_html_with_fallback() -> None:
    """An HTML table followed by an image fallback passes when fallback is allowed."""
    html_table_lines = [
        '<table id="tbl-1">',
        "<tr><th rowspan=\"2\">Load</th><th>Value</th></tr>",
        "<tr><td>42</td></tr>",
        "</table>",
        "![Table 1 fallback](images/table-1.png)",
    ]
    document = "\n".join(html_table_lines)
    issues = validate_tables(document, allow_html_table_fallback=True)
    assert issues == []
def test_frontmatter_requires_chunk_context_fields() -> None:
    """Frontmatter lacking ``title`` and ``page_range`` reports both, in that order."""
    incomplete = "---\ndocument_slug: paper\nchunk_index: 1\n---\n# Paper"
    reported = messages(validate_chunk_frontmatter(incomplete))
    expected = [
        "Chunk frontmatter is missing required field: title",
        "Chunk frontmatter is missing required field: page_range",
    ]
    assert reported == expected
def test_frontmatter_accepts_required_chunk_context_fields() -> None:
    """Frontmatter carrying title, slug, chunk index and page range is accepted."""
    complete = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1-3\n---\n# Paper"
    issues = validate_chunk_frontmatter(complete)
    assert issues == []
def test_caption_reference_anchor_shape_checks_known_reference_targets() -> None:
    """A link to a defined anchor passes; a link to an undefined one is reported."""
    lines = [
        '<a id="fig-1"></a>',
        "![Figure 1](images/fig-1.png)",
        "Figure 1. Diagram.",
        "As shown in [Fig. 1](#fig-1) and [Table 2](#table-2).",
    ]
    document = "\n".join(lines)
    reported = messages(validate_caption_reference_anchors(document))
    # Only ``#table-2`` lacks a matching anchor in the document above.
    assert reported == ["Reference link points to a missing anchor: #table-2"]
def test_combined_quality_gate_does_not_mutate_markdown() -> None:
    """The combined gate echoes the input text unchanged and reports no issues."""
    original = "---\ntitle: Paper\ndocument_slug: paper\nchunk_index: 1\npage_range: 1\n---\n$E=mc^2$"
    outcome = validate_markdown_quality(original)
    assert outcome.markdown == original
    assert outcome.ok
    # ``issues`` is compared against an empty tuple, not an empty list.
    assert outcome.issues == ()