Add Markdown recheck command
This commit is contained in:
+29
-2
@@ -16,8 +16,9 @@ from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FakeAdapter:
|
||||
def __init__(self, *, succeeded: bool = True) -> None:
|
||||
def __init__(self, *, succeeded: bool = True, raw_markdown: str | None = None) -> None:
|
||||
self.succeeded = succeeded
|
||||
self.raw_markdown = raw_markdown
|
||||
self.calls: list[Path] = []
|
||||
self.options: list[object] = []
|
||||
|
||||
@@ -33,7 +34,7 @@ class FakeAdapter:
|
||||
command=("mineru", "-p", str(input_path), "-o", str(output_dir)),
|
||||
input_pdf=input_path,
|
||||
work_dir=output_dir,
|
||||
raw_markdown=f"# {input_path.stem}\n" if self.succeeded else None,
|
||||
raw_markdown=(self.raw_markdown or f"# {input_path.stem}\n") if self.succeeded else None,
|
||||
raw_structured={"pages": 1},
|
||||
asset_paths=(),
|
||||
warnings=() if self.succeeded else (warning,),
|
||||
@@ -188,6 +189,32 @@ def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
|
||||
assert not (tmp_path / "out" / "paper.md").exists()
|
||||
|
||||
|
||||
def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path: Path, capsys) -> None:
    """Run ``convert`` with a failing math checker, edit the Markdown, then ``recheck`` it.

    The second CLI invocation uses a math checker that always passes, so the
    command is expected to exit with 0, print a ``rechecked:`` summary with
    zero warnings, and leave a report next to the Markdown whose final status
    is ``success``.
    """
    pdf = make_pdf(tmp_path, "paper.pdf")
    out = tmp_path / "out"

    convert_exit_code = main(
        ["convert", str(pdf), "--out", str(out)],
        adapter=FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n"),
        clock=fixed_clock,
        math_checker=lambda _: False,
    )
    assert convert_exit_code == 0
    # Drain whatever the convert step printed so that only the recheck output
    # is inspected below.
    capsys.readouterr()

    # Overwrite the generated Markdown by hand before asking for a recheck.
    markdown_path = out / "paper.md"
    markdown_path.write_text("Inline $x_i$\n", encoding="utf-8")

    recheck_args = ["recheck", str(markdown_path)]
    exit_code = main(recheck_args, clock=fixed_clock, math_checker=lambda _: True)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    assert "rechecked:" in stdout
    assert "warnings: 0" in stdout
    report_path = out / "paper.report.md"
    assert "- Final status: `success`" in report_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
|
||||
pdf = make_pdf(tmp_path, "paper.pdf")
|
||||
out = tmp_path / "out"
|
||||
|
||||
@@ -9,7 +9,7 @@ import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf
|
||||
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf, recheck_markdown
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
@@ -230,6 +230,33 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
|
||||
assert "`MATH_RENDER_FAILED`" in report
|
||||
|
||||
|
||||
def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(tmp_path: Path) -> None:
    """Check that ``recheck_markdown`` rewrites metadata and report from the edited Markdown.

    A PDF is first converted with a math checker that always fails; the
    Markdown is then overwritten and rechecked with a checker that always
    passes. The recheck result must point at the same three output files and
    the regenerated metadata/report must show a clean, warning-free run.
    """
    pdf = make_pdf(tmp_path)
    result = convert_pdf(
        pdf,
        tmp_path / "out",
        adapter=FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n"),
        math_checker=lambda _: False,
        clock=fixed_clock,
    )

    # Replace the converted Markdown by hand, then recheck it in place.
    result.markdown_path.write_text("Inline $x_i$\n", encoding="utf-8")
    rechecked = recheck_markdown(
        result.markdown_path,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    assert rechecked.final_status == "success"
    assert rechecked.warning_count == 0
    # The recheck must reuse the output paths of the original conversion.
    for path_attr in ("markdown_path", "metadata_path", "report_path"):
        assert getattr(rechecked, path_attr) == getattr(result, path_attr)

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest()
    assert metadata["created_at"] == "2026-05-08T00:00:00Z"
    summary = metadata["summary"]
    assert summary["pages_processed"] == 1
    assert summary["inline_formula_count"] == 1
    assert summary["math_render_error_count"] == 0
    assert summary["warning_count"] == 0
    assert metadata["warnings"] == []

    report = result.report_path.read_text(encoding="utf-8")
    for expected_line in (
        "- Final status: `success`",
        "- Math render error count: 0",
        "- None",
    ):
        assert expected_line in report
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
|
||||
|
||||
@@ -6,3 +6,4 @@ import pdf2md
|
||||
def test_package_imports() -> None:
    """Check the package version string and that the public entry points are callable."""
    assert pdf2md.__version__ == "0.1.0"
    for entry_point in ("convert_pdf", "recheck_markdown"):
        assert callable(getattr(pdf2md, entry_point))
|
||||
|
||||
Reference in New Issue
Block a user