feat: mitigate MathJax formula warnings

This commit is contained in:
NINI
2026-05-11 02:08:46 +09:00
parent 005f17bac1
commit 71e6fbcc51
12 changed files with 625 additions and 41 deletions
+41
View File
@@ -13,6 +13,7 @@ from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf,
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
from pdf2md.paths import OutputConflictError
from pdf2md.quality import MathCheckResult
class FakeAdapter:
@@ -230,6 +231,27 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
assert "`MATH_RENDER_FAILED`" in report
def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path: Path) -> None:
    """Check that ``convert_pdf`` writes repaired math and reports the repair.

    The fake adapter emits ``\\[x ^ {i} ^ {t}\\]`` and the stub checker only
    accepts bodies containing ``{} ^ {t}``.  The test then expects the
    written markdown to hold ``x ^ {i} {} ^ {t}``, the run to finish as
    ``"partial"``, and a single ``MATH_RENDER_REPAIRED`` warning to show up in
    the result, the metadata JSON and the report, with a render error count
    of zero.
    """

    class RepairAwareChecker:
        """Stub math checker that passes only the repaired form of the formula."""

        def check_expressions(self, expressions):
            # One verdict per expression, in input order.
            verdicts = []
            for expression in expressions:
                verdicts.append(MathCheckResult(ok="{} ^ {t}" in expression.body))
            return tuple(verdicts)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="\\[x ^ {i} ^ {t}\\]\n")

    outcome = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    # Result object: status, markdown on disk, and the recorded warnings.
    assert outcome.final_status == "partial"
    written_markdown = outcome.markdown_path.read_text(encoding="utf-8")
    assert written_markdown == "$$\nx ^ {i} {} ^ {t}\n$$"
    warning_codes = [entry.code for entry in outcome.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # Metadata JSON must agree with the in-memory result.
    metadata_doc = json.loads(outcome.metadata_path.read_text(encoding="utf-8"))
    assert metadata_doc["summary"]["math_render_error_count"] == 0
    assert metadata_doc["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"

    # The human-readable report carries the same count and warning code.
    report_text = outcome.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 0" in report_text
    assert "`MATH_RENDER_REPAIRED`" in report_text
def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None:
pdf = make_pdf(tmp_path)
adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")
@@ -257,6 +279,25 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
assert "- None" in report
def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None:
    """Check that ``recheck_markdown`` repairs a failing formula in place.

    A formula-free document is converted first, then its markdown file is
    overwritten with ``x ^ {i} ^ {t}`` in a ``$$`` block.  After rechecking
    with a stub checker that only accepts bodies containing ``{} ^ {t}``, the
    test expects the markdown to hold ``x ^ {i} {} ^ {t}``, a single
    ``MATH_RENDER_REPAIRED`` warning, and metadata reporting zero render
    errors.
    """

    class RepairAwareChecker:
        """Stub math checker that passes only the repaired form of the formula."""

        def check_expressions(self, expressions):
            # One verdict per expression, in input order.
            verdicts = []
            for expression in expressions:
                verdicts.append(MathCheckResult(ok="{} ^ {t}" in expression.body))
            return tuple(verdicts)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="No formulas.\n")
    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # Replace the converted output with a formula the stub checker rejects.
    result.markdown_path.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8")

    rechecked = recheck_markdown(
        result.markdown_path,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    repaired_markdown = rechecked.markdown_path.read_text(encoding="utf-8")
    assert repaired_markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"
    warning_codes = [entry.code for entry in rechecked.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # NOTE(review): metadata is read via the original conversion result, not
    # ``rechecked`` - presumably both point at the same file; confirm.
    metadata_doc = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata_doc["summary"]["math_render_error_count"] == 0
    assert metadata_doc["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
pdf = make_pdf(tmp_path)
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")