feat: mitigate MathJax formula warnings
This commit is contained in:
@@ -13,6 +13,7 @@ from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf,
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
from pdf2md.quality import MathCheckResult
|
||||
|
||||
|
||||
class FakeAdapter:
|
||||
@@ -230,6 +231,27 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
|
||||
assert "`MATH_RENDER_FAILED`" in report
|
||||
|
||||
|
||||
def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path: Path) -> None:
    """Check that ``convert_pdf`` writes repaired math and records the repair.

    The fake adapter emits ``\\[x ^ {i} ^ {t}\\]``. The stub checker below only
    accepts expressions whose body contains ``{} ^ {t}``, so the raw formula is
    rejected and only the form with an inserted empty group passes. The test
    then expects the written Markdown, the warnings, the metadata JSON, and the
    report to all describe a repaired formula with zero render errors.
    """

    class RepairAwareChecker:
        """Checker stub: an expression is ok only if it contains ``{} ^ {t}``."""

        def check_expressions(self, expressions):
            verdicts = []
            for expression in expressions:
                verdicts.append(MathCheckResult(ok="{} ^ {t}" in expression.body))
            return tuple(verdicts)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="\\[x ^ {i} ^ {t}\\]\n")

    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    assert result.final_status == "partial"

    # The Markdown on disk must already carry the repaired expression.
    written_markdown = result.markdown_path.read_text(encoding="utf-8")
    assert written_markdown == "$$\nx ^ {i} {} ^ {t}\n$$"

    # Exactly one warning is expected, and it is the "repaired" one.
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    metadata_text = result.metadata_path.read_text(encoding="utf-8")
    metadata = json.loads(metadata_text)
    assert metadata["summary"]["math_render_error_count"] == 0
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"

    report_text = result.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 0" in report_text
    assert "`MATH_RENDER_REPAIRED`" in report_text
|
||||
|
||||
|
||||
def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")
|
||||
@@ -257,6 +279,25 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
|
||||
assert "- None" in report
|
||||
|
||||
|
||||
def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None:
    """Check that ``recheck_markdown`` repairs a failing formula in existing Markdown.

    A formula-free document is converted first. The Markdown file is then
    overwritten with ``x ^ {i} ^ {t}`` in a ``$$`` block and rechecked with a
    stub checker that only accepts bodies containing ``{} ^ {t}``. The test
    expects the file to be rewritten in the repaired form and the repair to be
    recorded in the warnings and metadata.
    """

    class RepairAwareChecker:
        """Checker stub: an expression is ok only if it contains ``{} ^ {t}``."""

        def check_expressions(self, expressions):
            def _verdict(expression):
                return MathCheckResult(ok="{} ^ {t}" in expression.body)

            return tuple(_verdict(expression) for expression in expressions)

    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown="No formulas.\n")
    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # Replace the converted output with Markdown the stub checker rejects as written.
    result.markdown_path.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8")

    rechecked = recheck_markdown(
        result.markdown_path,
        math_checker=RepairAwareChecker(),
        clock=fixed_clock,
    )

    repaired_markdown = rechecked.markdown_path.read_text(encoding="utf-8")
    assert repaired_markdown == "$$\nx ^ {i} {} ^ {t}\n$$\n"

    warning_codes = [warning.code for warning in rechecked.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_REPAIRED]

    # NOTE(review): metadata is read via the original conversion result's path,
    # not via `rechecked`; presumably recheck rewrites that same file - confirm.
    metadata_text = result.metadata_path.read_text(encoding="utf-8")
    metadata = json.loads(metadata_text)
    assert metadata["summary"]["math_render_error_count"] == 0
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
|
||||
|
||||
Reference in New Issue
Block a user