feat: mitigate MathJax formula warnings
This commit is contained in:
@@ -13,6 +13,7 @@ from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf,
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
from pdf2md.quality import MathCheckResult
|
||||
|
||||
|
||||
class FakeAdapter:
|
||||
@@ -230,6 +231,27 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
|
||||
assert "`MATH_RENDER_FAILED`" in report
|
||||
|
||||
|
||||
def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path: Path) -> None:
    """Check that convert_pdf writes repaired math and reports the repair.

    The fake adapter emits a display formula with a repeated superscript and
    the fake checker only accepts bodies containing ``{} ^ {t}``. The test
    asserts that the written markdown contains the accepted form, that the
    only warning is ``MATH_RENDER_REPAIRED``, and that metadata and report
    both show a render error count of zero.
    """

    class RepairAwareChecker:
        """Fake checker: a body is ok only if it contains ``{} ^ {t}``."""

        def check_expressions(self, expressions):
            verdicts = []
            for expression in expressions:
                verdicts.append(MathCheckResult(ok="{} ^ {t}" in expression.body))
            return tuple(verdicts)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="\\[x ^ {i} ^ {t}\\]\n")

    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    assert result.final_status == "partial"

    written_markdown = result.markdown_path.read_text(encoding="utf-8")
    assert written_markdown == "$$\nx ^ {i} {} ^ {t}\n$$"

    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # The persisted metadata must agree with the in-memory result.
    metadata_text = result.metadata_path.read_text(encoding="utf-8")
    metadata = json.loads(metadata_text)
    assert metadata["summary"]["math_render_error_count"] == 0
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"

    # The human-readable report must show the same outcome.
    report = result.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 0" in report
    assert "`MATH_RENDER_REPAIRED`" in report
|
||||
|
||||
|
||||
def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")
|
||||
@@ -257,6 +279,25 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
|
||||
assert "- None" in report
|
||||
|
||||
|
||||
def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None:
    """Check that recheck_markdown repairs a formula edited into the markdown.

    A formula-free document is converted first, then the markdown file is
    overwritten with a repeated-superscript formula. Rechecking with a fake
    checker that only accepts ``{} ^ {t}`` must rewrite the markdown, emit a
    single ``MATH_RENDER_REPAIRED`` warning and record zero render errors.
    """

    class RepairAwareChecker:
        """Fake checker: a body is ok only if it contains ``{} ^ {t}``."""

        def check_expressions(self, expressions):
            verdicts = []
            for expression in expressions:
                verdicts.append(MathCheckResult(ok="{} ^ {t}" in expression.body))
            return tuple(verdicts)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="No formulas.\n")
    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # Simulate a manual edit that introduces a formula the checker rejects.
    result.markdown_path.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8")

    rechecked = recheck_markdown(
        result.markdown_path,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    repaired_markdown = rechecked.markdown_path.read_text(encoding="utf-8")
    assert repaired_markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"

    warning_codes = [warning.code for warning in rechecked.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # NOTE(review): metadata is read through the original conversion result's
    # path, not through `rechecked`; presumably the recheck rewrites that same
    # file - confirm against recheck_markdown.
    metadata_text = result.metadata_path.read_text(encoding="utf-8")
    metadata = json.loads(metadata_text)
    assert metadata["summary"]["math_render_error_count"] == 0
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pdf2md.ir import WarningCode, WarningSeverity
|
||||
from pdf2md.math_repair import repair_math_render_failures
|
||||
from pdf2md.quality import MathCheckResult, MathRenderFailure, extract_math_expressions
|
||||
|
||||
|
||||
class BodyChecker:
    """Fake math checker that passes bodies containing a given fragment.

    Every body it is asked to check is appended to ``checked_bodies``.
    """

    def __init__(self, passing_fragment: str) -> None:
        """Store the fragment that marks a body as passing."""
        self.passing_fragment = passing_fragment
        self.checked_bodies: list[str] = []

    def check_expressions(self, expressions):
        """Record each expression body and return one result per expression.

        ``expressions`` is traversed twice: once to record the bodies and
        once to build the results.
        """
        for expression in expressions:
            self.checked_bodies.append(expression.body)

        results = []
        for expression in expressions:
            passed = self.passing_fragment in expression.body
            results.append(MathCheckResult(ok=passed))
        return tuple(results)
|
||||
|
||||
|
||||
def test_repair_math_render_failures_disambiguates_repeated_superscripts() -> None:
    """Check the repair of a repeated superscript in a display formula.

    With a checker that accepts bodies containing ``{} ^ {t}``, the repaired
    markdown must read ``x ^ {i} {} ^ {t}``, the first repair must carry the
    ``repeated_script`` rule, and the first warning must be an INFO-severity
    ``MATH_RENDER_REPAIRED``.
    """
    markdown = "$$\nx ^ {i} ^ {t}\n$$\n"
    first_expression = extract_math_expressions(markdown)[0]
    failure = MathRenderFailure(
        expression=first_expression,
        message="Double exponent: use braces to clarify",
    )
    checker = BodyChecker("{} ^ {t}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    assert result.markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"
    assert result.repairs[0].rule == "repeated_script"

    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_REPAIRED
    assert first_warning.severity == WarningSeverity.INFO
|
||||
|
||||
|
||||
def test_repair_math_render_failures_repairs_truncated_array_environment() -> None:
    """Check the repair of a truncated ``\\end{a}`` closing an array.

    With a checker that accepts bodies containing ``\\end{array}``, the
    repaired markdown must close the environment with ``\\end{array}`` and
    the first repair must carry the ``truncated_array_end`` rule.
    """
    markdown = "$$\n\\begin{array}{c} x \\end{a}\n$$\n"
    first_expression = extract_math_expressions(markdown)[0]
    failure = MathRenderFailure(
        expression=first_expression,
        message="Unknown environment 'a'",
    )
    checker = BodyChecker("\\end{array}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    expected_markdown = "$$\n\\begin{array}{c} x \\end{array}\n$$\n"
    assert result.markdown == expected_markdown
    assert result.repairs[0].rule == "truncated_array_end"
|
||||
|
||||
|
||||
def test_repair_math_render_failures_leaves_markdown_unchanged_when_candidate_fails() -> None:
    """Check that nothing changes when the checker rejects every candidate.

    The checker's passing fragment is ``never-passes``. The result must keep
    the input markdown and report empty ``repairs`` and ``warnings`` tuples.
    """
    original_markdown = "$$\nx ^ {i} ^ {t}\n$$\n"
    first_expression = extract_math_expressions(original_markdown)[0]
    failure = MathRenderFailure(
        expression=first_expression,
        message="Double exponent: use braces to clarify",
    )
    checker = BodyChecker("never-passes")

    result = repair_math_render_failures(original_markdown, (failure,), checker)

    assert result.markdown == original_markdown
    assert result.repairs == ()
    assert result.warnings == ()
|
||||
|
||||
|
||||
def test_repair_math_render_failures_only_changes_failed_spans() -> None:
    """Check that only the failed expression is rewritten.

    The markdown holds two inline formulas and only the first is reported as
    a failure. The repaired markdown must change the first formula and leave
    the second one and the surrounding text as they were.
    """
    markdown = "$a ^ {b} ^ {c}$ and $unchanged ^ {ok}$\n"
    expressions = extract_math_expressions(markdown)
    failed_expression = expressions[0]
    failure = MathRenderFailure(
        expression=failed_expression,
        message="Double exponent: use braces to clarify",
    )
    checker = BodyChecker("{} ^ {c}")

    result = repair_math_render_failures(markdown, (failure,), checker)

    expected_markdown = "$a ^ {b} {} ^ {c}$ and $unchanged ^ {ok}$\n"
    assert result.markdown == expected_markdown
|
||||
@@ -6,6 +6,7 @@ from pdf2md.ir import WarningCode, WarningSeverity
|
||||
from pdf2md.quality import (
|
||||
MathCheckerUnavailable,
|
||||
MathCheckResult,
|
||||
check_math_renderability_details,
|
||||
check_asset_links,
|
||||
check_math_renderability,
|
||||
extract_math_expressions,
|
||||
@@ -71,6 +72,20 @@ def test_math_render_failures_are_aggregated_with_fake_checker() -> None:
|
||||
assert "bad_math failed" in result.warnings[0].message
|
||||
|
||||
|
||||
def test_math_renderability_details_include_failed_expression_records() -> None:
    """Check that the detailed render check exposes the failed expression.

    The input has one inline formula and one display formula, and the fake
    checker rejects any body containing ``bad``. The result must count one
    render error and hold one failure whose expression has index 1, body
    ``bad_math``, display mode on, and the checker's message.
    """

    def reject_bad_bodies(body: str) -> MathCheckResult:
        # Every result carries a message, whether or not the body passed.
        accepted = "bad" not in body
        return MathCheckResult(ok=accepted, message=f"{body} failed")

    markdown = "$x_i$\n\n$$\nbad_math\n$$"
    result = check_math_renderability_details(markdown, reject_bad_bodies)

    assert result.quality.math_render_error_count == 1
    assert len(result.failures) == 1

    failure = result.failures[0]
    assert failure.expression.index == 1
    assert failure.expression.body == "bad_math"
    assert failure.expression.display is True
    assert failure.message == "bad_math failed"
|
||||
|
||||
|
||||
def test_math_extraction_records_display_mode_and_markdown_spans() -> None:
|
||||
markdown = "Inline $x_i^2$ before\n\n$$\n\\frac{1}{2}\n$$\n"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user