# File: PDFToMD/tests/integration/test_v1_fast_release_gate.py
# File-viewer timestamp: 2026-05-14 10:16:59 +09:00
# File-viewer stats: 161 lines, 6.4 KiB, Python

from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from pdf2md.cli import main
from pdf2md.conversion import convert_pdf
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult
class FixtureAdapter:
    """Test double for the MinerU adapter.

    It records every ``convert`` call, optionally writes one fake image
    asset into the work directory, and returns a canned
    ``MinerUAdapterResult`` built from the values given at construction.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        """Store the canned conversion outcome.

        Args:
            raw_markdown: Markdown reported when the run succeeds.
            raw_structured: Structured payload passed through unchanged.
            succeeded: Whether ``convert`` reports success.
            asset_name: File name of a fake asset to write, or ``None``
                to write no asset.
            warnings: Warnings reported when the run succeeds.
        """
        # One (input_pdf, work_dir) pair is appended per convert() call.
        self.calls: list[tuple[Path, Path]] = []
        self.warnings = warnings
        self.asset_name = asset_name
        self.succeeded = succeeded
        self.raw_structured = raw_structured
        self.raw_markdown = raw_markdown

    def _write_fake_asset(self, workspace: Path) -> tuple[Path, ...]:
        """Write the configured fake asset under ``workspace/assets``.

        Returns a one-element tuple with the written path, or an empty
        tuple when no asset name was configured.
        """
        if self.asset_name is None:
            return ()
        target = workspace / "assets" / self.asset_name
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(b"fake image")
        return (target,)

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Pretend to run MinerU and return the canned result.

        The work directory is created, the call is recorded in
        ``self.calls``, and the optional fake asset is written before the
        result is built. On failure the result carries no Markdown, a
        single ``MINERU_CLI_FAILED`` error warning, and exit code 2.
        """
        source = Path(input_pdf)
        workspace = Path(work_dir)
        workspace.mkdir(parents=True, exist_ok=True)
        self.calls.append((source, workspace))

        written_assets: tuple[Path, ...] = self._write_fake_asset(workspace)

        # Built on every call, although only the failure branch reports it.
        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown, reported_warnings, exit_code = self.raw_markdown, self.warnings, 0
        else:
            markdown, reported_warnings, exit_code = None, (failure,), 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(workspace)),
            input_pdf=source,
            work_dir=workspace,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=written_assets,
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
def fixed_clock() -> datetime:
    """Return a constant UTC timestamp (2026-05-08 00:00) for tests."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
def make_pdf(directory: Path, name: str) -> Path:
    """Write a tiny placeholder PDF at ``directory / name``.

    Missing parent directories are created first. The path of the written
    file is returned.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("wb") as handle:
        handle.write(b"%PDF-1.7\nfast integration fixture\n")
    return target
def report_metadata(result) -> dict:
    """Return ``result._report_metadata``, asserting that it is set."""
    metadata = result._report_metadata
    assert metadata is not None
    return metadata
def test_v1_fast_conversion_writes_markdown_report_assets_and_quality_counts(tmp_path: Path) -> None:
    """Convert a math-heavy fixture and check every written artefact.

    The fake adapter supplies Markdown with inline and display math, an
    HTML table and one image. The run is expected to end as ``partial``
    with a single ``TABLE_FALLBACK`` warning (presumably caused by the
    rowspan table; confirm against the converter). No metadata JSON
    should be written.
    """
    source_pdf = make_pdf(tmp_path, "math.pdf")
    out_root = tmp_path / "out"
    doc_dir = out_root / "math"
    raw_markdown = "".join(
        [
            "# Shell Element\n\n",
            "Inline \\(u_i\\) and display:\n\n",
            "\\[\nK u = f\n\\]\n\n",
            '<table><tr><td rowspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n',
            "![mesh](assets/mesh.png)\n",
        ]
    )
    adapter = FixtureAdapter(
        raw_markdown=raw_markdown,
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(source_pdf, out_root, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    # Output locations and status.
    assert result.final_status == "partial"
    assert result.markdown_path == doc_dir / "math_001.md"
    assert result.markdown_path.exists()
    assert result.metadata_path is None
    assert list(out_root.rglob("*.metadata.json")) == []
    assert result.report_path == doc_dir / "math_report.md"
    assert result.report_path.exists()
    assert result.assets_dir == doc_dir / "images"
    assert (result.assets_dir / "mesh.png").read_bytes() == b"fake image"

    # Math delimiters and the image link are rewritten in the Markdown.
    markdown_text = result.markdown_path.read_text(encoding="utf-8")
    for fragment in ("$u_i$", "$$\nK u = f\n$$", "![mesh](images/mesh.png)"):
        assert fragment in markdown_text

    # In-memory report metadata.
    metadata = report_metadata(result)
    summary = metadata["summary"]
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    assert summary["pages_processed"] == 3
    assert summary["asset_count"] == 1
    assert summary["inline_formula_count"] == 1
    assert summary["display_formula_count"] == 1
    assert summary["math_render_error_count"] == 0
    assert summary["warning_count"] == 1
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "images/mesh.png"}]

    # Rendered report text.
    report_text = result.report_path.read_text(encoding="utf-8")
    expected_in_report = (
        "- Final status: `partial`",
        "- Output Markdown:",
        "- Report Markdown:",
        "- Math render error count: 0",
        "`TABLE_FALLBACK`",
    )
    for line in expected_in_report:
        assert line in report_text
    assert "- Metadata JSON:" not in report_text
def test_v1_fast_failure_records_no_fallback_and_writes_report_only(tmp_path: Path) -> None:
    """A failing adapter yields a ``failed`` result with only a report.

    Exactly one ``MINERU_CLI_FAILED`` warning is expected. No Markdown
    file and no metadata path should be produced.
    """
    failing_adapter = FixtureAdapter(raw_markdown="", succeeded=False)
    source_pdf = make_pdf(tmp_path, "failed.pdf")

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert result.final_status == "failed"
    assert result.warning_count == 1
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MINERU_CLI_FAILED
    assert not result.markdown_path.exists()
    assert result.report_path.exists()
    assert result.metadata_path is None
    report_text = result.report_path.read_text(encoding="utf-8")
    assert "- Final status: `failed`" in report_text
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """Run the CLI over a directory of two PDFs and check the summary.

    The adapter must be called once per PDF with the resolved input path,
    in order. The printed counts must match, each PDF must get its own
    Markdown and report file, and no metadata JSON may be written.
    """
    source_dir = tmp_path / "pdfs"
    out_root = tmp_path / "out"
    first_pdf = make_pdf(source_dir, "a.pdf")
    second_pdf = make_pdf(source_dir, "korean.pdf")
    adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    argv = ["convert", str(source_dir), "--out", str(out_root)]
    exit_code = main(argv, adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    seen_inputs = [input_path for input_path, _ in adapter.calls]
    assert seen_inputs == [first_pdf.resolve(), second_pdf.resolve()]

    for summary_line in ("converted: 2", "failed: 0", "warnings: 0"):
        assert summary_line in stdout

    expected_files = (
        out_root.joinpath("a", "a_001.md"),
        out_root.joinpath("a", "a_report.md"),
        out_root.joinpath("korean", "korean_001.md"),
        out_root.joinpath("korean", "korean_report.md"),
    )
    for expected in expected_files:
        assert expected.exists()
    assert list(out_root.rglob("*.metadata.json")) == []