feat: mitigate MathJax formula warnings

This commit is contained in:
NINI
2026-05-11 02:08:46 +09:00
parent 005f17bac1
commit 71e6fbcc51
12 changed files with 625 additions and 41 deletions
+41
View File
@@ -13,6 +13,7 @@ from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf,
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
from pdf2md.paths import OutputConflictError
from pdf2md.quality import MathCheckResult
class FakeAdapter:
@@ -230,6 +231,27 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
assert "`MATH_RENDER_FAILED`" in report
def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path: Path) -> None:
    """Check that ``convert_pdf`` emits the repaired formula and reports the repair.

    The fake checker accepts an expression only when its body contains
    ``{} ^ {t}``, so the raw ``x ^ {i} ^ {t}`` formula is rejected and only
    the form with the inserted empty group is accepted.
    """

    class RepairAwareChecker:
        def check_expressions(self, expressions):
            verdicts = [
                MathCheckResult(ok="{} ^ {t}" in expression.body)
                for expression in expressions
            ]
            return tuple(verdicts)

    pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="\\[x ^ {i} ^ {t}\\]\n")
    output_dir = tmp_path / "out"

    result = convert_pdf(
        pdf,
        output_dir,
        adapter=adapter,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    assert result.final_status == "partial"

    # The written markdown must already contain the repaired expression.
    written_markdown = result.markdown_path.read_text(encoding="utf-8")
    assert written_markdown == "$$\nx ^ {i} {} ^ {t}\n$$"

    # Exactly one warning is expected, and it is the "repaired" notice.
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    first_warning = metadata["warnings"][0]
    assert summary["math_render_error_count"] == 0
    assert first_warning["code"] == "MATH_RENDER_REPAIRED"

    report = result.report_path.read_text(encoding="utf-8")
    for expected_fragment in ("- Math render error count: 0", "`MATH_RENDER_REPAIRED`"):
        assert expected_fragment in report
def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None:
pdf = make_pdf(tmp_path)
adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")
@@ -257,6 +279,25 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
assert "- None" in report
def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None:
    """Check that ``recheck_markdown`` repairs a failing formula in edited markdown.

    A formula-free document is converted first, then the markdown file is
    overwritten with ``x ^ {i} ^ {t}``. The fake checker used for the recheck
    accepts only bodies containing ``{} ^ {t}``.
    """

    class RepairAwareChecker:
        def check_expressions(self, expressions):
            verdicts = [
                MathCheckResult(ok="{} ^ {t}" in expression.body)
                for expression in expressions
            ]
            return tuple(verdicts)

    pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="No formulas.\n")
    result = convert_pdf(
        pdf,
        tmp_path / "out",
        adapter=adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # Replace the converted markdown with a formula the checker will reject.
    markdown_file = result.markdown_path
    markdown_file.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8")

    rechecked = recheck_markdown(
        markdown_file,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    repaired_markdown = rechecked.markdown_path.read_text(encoding="utf-8")
    assert repaired_markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"

    warning_codes = [warning.code for warning in rechecked.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # NOTE(review): metadata is read through the original conversion result,
    # not `rechecked` -- presumably both point at the same file; confirm.
    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    first_warning = metadata["warnings"][0]
    assert summary["math_render_error_count"] == 0
    assert first_warning["code"] == "MATH_RENDER_REPAIRED"
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
pdf = make_pdf(tmp_path)
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
+65
View File
@@ -0,0 +1,65 @@
from __future__ import annotations
from pdf2md.ir import WarningCode, WarningSeverity
from pdf2md.math_repair import repair_math_render_failures
from pdf2md.quality import MathCheckResult, MathRenderFailure, extract_math_expressions
class BodyChecker:
    """Fake math checker that passes expressions whose body contains a fragment.

    Every body submitted for checking is appended to ``checked_bodies`` so a
    test can inspect what was sent to the checker.
    """

    def __init__(self, passing_fragment: str) -> None:
        """Store the fragment that marks a body as passing.

        Args:
            passing_fragment: Substring an expression body must contain for
                its check result to be ``ok``.
        """
        self.passing_fragment = passing_fragment
        self.checked_bodies: list[str] = []

    def check_expressions(self, expressions):
        """Record and check each expression body.

        Args:
            expressions: Iterable of objects exposing a ``body`` attribute.

        Returns:
            A tuple with one ``MathCheckResult`` per expression, in input
            order, whose ``ok`` is true when ``passing_fragment`` occurs in
            the expression body.
        """
        # Materialize the bodies once: iterating `expressions` twice would
        # record a one-shot iterator's bodies but return an empty tuple.
        bodies = [expression.body for expression in expressions]
        self.checked_bodies.extend(bodies)
        return tuple(
            MathCheckResult(ok=self.passing_fragment in body) for body in bodies
        )
def test_repair_math_render_failures_disambiguates_repeated_superscripts() -> None:
    """A repeated superscript is repaired by inserting an empty group before the second one."""
    markdown = "$$\nx ^ {i} ^ {t}\n$$\n"
    expressions = extract_math_expressions(markdown)
    failure = MathRenderFailure(
        expression=expressions[0],
        message="Double exponent: use braces to clarify",
    )
    # The checker accepts only bodies that contain the repaired fragment.
    checker = BodyChecker("{} ^ {t}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    assert result.markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"

    first_repair = result.repairs[0]
    assert first_repair.rule == "repeated_script"

    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_REPAIRED
    assert first_warning.severity == WarningSeverity.INFO
def test_repair_math_render_failures_repairs_truncated_array_environment() -> None:
    """A truncated ``\\end{a}`` closing an ``array`` environment is restored to ``\\end{array}``."""
    markdown = "$$\n\\begin{array}{c} x \\end{a}\n$$\n"
    expressions = extract_math_expressions(markdown)
    failure = MathRenderFailure(
        expression=expressions[0],
        message="Unknown environment 'a'",
    )
    # The checker accepts only bodies that contain the full closing tag.
    checker = BodyChecker("\\end{array}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    expected_markdown = "$$\n\\begin{array}{c} x \\end{array}\n$$\n"
    assert result.markdown == expected_markdown

    first_repair = result.repairs[0]
    assert first_repair.rule == "truncated_array_end"
def test_repair_math_render_failures_leaves_markdown_unchanged_when_candidate_fails() -> None:
    """When the checker rejects every body, no repair or warning is produced."""
    markdown = "$$\nx ^ {i} ^ {t}\n$$\n"
    expressions = extract_math_expressions(markdown)
    failure = MathRenderFailure(
        expression=expressions[0],
        message="Double exponent: use braces to clarify",
    )
    # This fragment does not occur in the formula or in the repaired forms
    # the sibling tests expect, so no candidate is accepted.
    checker = BodyChecker("never-passes")

    result = repair_math_render_failures(markdown, (failure,), checker)

    assert result.markdown == markdown
    assert result.repairs == ()
    assert result.warnings == ()
def test_repair_math_render_failures_only_changes_failed_spans() -> None:
    """Only the expression reported as failed is rewritten; the other one stays as-is."""
    markdown = "$a ^ {b} ^ {c}$ and $unchanged ^ {ok}$\n"
    expressions = extract_math_expressions(markdown)
    # Report just the first inline expression as a render failure.
    failure = MathRenderFailure(
        expression=expressions[0],
        message="Double exponent: use braces to clarify",
    )
    checker = BodyChecker("{} ^ {c}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    expected_markdown = "$a ^ {b} {} ^ {c}$ and $unchanged ^ {ok}$\n"
    assert result.markdown == expected_markdown
+15
View File
@@ -6,6 +6,7 @@ from pdf2md.ir import WarningCode, WarningSeverity
from pdf2md.quality import (
MathCheckerUnavailable,
MathCheckResult,
check_math_renderability_details,
check_asset_links,
check_math_renderability,
extract_math_expressions,
@@ -71,6 +72,20 @@ def test_math_render_failures_are_aggregated_with_fake_checker() -> None:
assert "bad_math failed" in result.warnings[0].message
def test_math_renderability_details_include_failed_expression_records() -> None:
    """The detailed check result exposes a record for each failed expression.

    The input holds one inline expression and one display expression; the
    fake checker fails any body containing ``bad``.
    """

    def checker(body: str) -> MathCheckResult:
        passed = "bad" not in body
        return MathCheckResult(ok=passed, message=f"{body} failed")

    markdown = "$x_i$\n\n$$\nbad_math\n$$"
    result = check_math_renderability_details(markdown, checker)

    assert result.quality.math_render_error_count == 1
    assert len(result.failures) == 1

    # The single failure must describe the display expression at index 1.
    failure = result.failures[0]
    failed_expression = failure.expression
    assert failed_expression.index == 1
    assert failed_expression.body == "bad_math"
    assert failed_expression.display is True
    assert failure.message == "bad_math failed"
def test_math_extraction_records_display_mode_and_markdown_spans() -> None:
markdown = "Inline $x_i^2$ before\n\n$$\n\\frac{1}{2}\n$$\n"