from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path

from pdf2md.cli import main
from pdf2md.conversion import convert_pdf
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult

# Fast integration tests: they drive ``convert_pdf`` and the CLI ``main`` with
# a canned adapter (``FixtureAdapter``) in place of a real MinerU run.


class FixtureAdapter:
    """Canned adapter that returns a pre-built ``MinerUAdapterResult``.

    Every ``convert`` call is recorded in ``calls`` as an
    ``(input_path, output_dir)`` pair so tests can assert on what was requested.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        """Store the canned payload.

        Args:
            raw_markdown: Markdown returned when ``succeeded`` is true.
            raw_structured: Structured payload passed through unchanged.
            succeeded: Whether ``convert`` reports success.
            asset_name: If given, a fake asset with this file name is written
                under ``<work_dir>/assets`` on each ``convert`` call.
            warnings: Warnings returned when ``succeeded`` is true.
        """
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.asset_name = asset_name
        self.warnings = warnings
        self.calls: list[tuple[Path, Path]] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call, write the optional fake asset, and return the canned result.

        On failure (``succeeded`` is false) the result carries no markdown, a
        single ``MINERU_CLI_FAILED`` error warning, and exit code 2.
        """
        input_path = Path(input_pdf)
        output_dir = Path(work_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append((input_path, output_dir))

        asset_paths: tuple[Path, ...] = ()
        if self.asset_name is not None:
            asset_path = output_dir / "assets" / self.asset_name
            asset_path.parent.mkdir(parents=True, exist_ok=True)
            asset_path.write_bytes(b"fake image")
            asset_paths = (asset_path,)

        failure = WarningRecord(
            WarningCode.MINERU_CLI_FAILED,
            WarningSeverity.ERROR,
            "MinerU failed.",
        )
        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(input_path), "-o", str(output_dir)),
            input_pdf=input_path,
            work_dir=output_dir,
            raw_markdown=self.raw_markdown if self.succeeded else None,
            raw_structured=self.raw_structured,
            asset_paths=asset_paths,
            warnings=self.warnings if self.succeeded else (failure,),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=(
                options.to_engine_options() if options is not None else {"strict_local": True}
            ),
            exit_code=0 if self.succeeded else 2,
            stdout="",
            stderr="",
        )


def fixed_clock() -> datetime:
    """Return a constant UTC timestamp (2026-05-08) for the ``clock`` argument."""
    return datetime(2026, 5, 8, tzinfo=timezone.utc)


def make_pdf(directory: Path, name: str) -> Path:
    """Create ``directory / name`` with a small PDF-like payload and return its path.

    Parent directories are created as needed.
    """
    path = directory / name
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(b"%PDF-1.7\nfast integration fixture\n")
    return path


def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(
    tmp_path: Path,
) -> None:
    """A successful conversion writes markdown, metadata, report and assets.

    Also checks the formula, asset, page and warning counts in the metadata,
    and that the report lists the ``partial`` status and ``TABLE_FALLBACK``.
    """
    pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
    adapter = FixtureAdapter(
        raw_markdown=(
            "# Shell Element\n\n"
            "Inline \\(u_i\\) and display:\n\n"
            "\\[\nK u = f\n\\]\n\n"
            # NOTE(review): this fragment arrived split across physical lines
            # with its markup stripped (only "\\(N_i\\)" and "stress" survived).
            # It is rebuilt here as an HTML table because the assertions below
            # expect exactly one TABLE_FALLBACK warning and only one inline
            # formula. Confirm the exact markup against the original fixture.
            '<table><tr><td colspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n'
            "![mesh](assets/mesh.png)\n"
        ),
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(
        pdf,
        tmp_path / "out",
        adapter=adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    assert result.final_status == "partial"
    assert result.markdown_path.exists()
    assert result.metadata_path is not None and result.metadata_path.exists()
    assert result.report_path.exists()
    assert (tmp_path / "out" / "쉘구조_math.assets" / "mesh.png").read_bytes() == b"fake image"

    markdown = result.markdown_path.read_text(encoding="utf-8")
    assert "$u_i$" in markdown
    assert "$$\nK u = f\n$$" in markdown
    assert "![mesh](쉘구조_math.assets/mesh.png)" in markdown

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    assert metadata["summary"]["pages_processed"] == 3
    assert metadata["summary"]["asset_count"] == 1
    assert metadata["summary"]["inline_formula_count"] == 1
    assert metadata["summary"]["display_formula_count"] == 1
    assert metadata["summary"]["math_render_error_count"] == 0
    assert metadata["summary"]["warning_count"] == 1
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]

    report = result.report_path.read_text(encoding="utf-8")
    assert "- Final status: `partial`" in report
    assert "- Output Markdown:" in report
    assert "- Metadata JSON:" in report
    assert "- Report Markdown:" in report
    assert "- Math render error count: 0" in report
    assert "`TABLE_FALLBACK`" in report


def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
    """A failed adapter run yields ``failed`` status, one warning, and no output files."""
    pdf = make_pdf(tmp_path, "failed.pdf")
    adapter = FixtureAdapter(raw_markdown="", succeeded=False)

    result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    assert result.final_status == "failed"
    assert result.warning_count == 1
    assert result.warnings[0].code == WarningCode.MINERU_CLI_FAILED
    assert not result.markdown_path.exists()
    assert not result.report_path.exists()
    assert result.metadata_path is not None and not result.metadata_path.exists()


def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """The CLI converts a directory of PDFs and prints a summary matching the files written."""
    source = tmp_path / "pdfs"
    first = make_pdf(source, "a.pdf")
    second = make_pdf(source, "한글.pdf")
    adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    exit_code = main(
        ["convert", str(source), "--out", str(tmp_path / "out")],
        adapter=adapter,
        clock=fixed_clock,
    )
    captured = capsys.readouterr()

    assert exit_code == 0
    assert [call[0] for call in adapter.calls] == [first.resolve(), second.resolve()]
    assert "converted: 2" in captured.out
    assert "failed: 0" in captured.out
    assert "warnings: 0" in captured.out
    assert (tmp_path / "out" / "a.md").exists()
    assert (tmp_path / "out" / "a.metadata.json").exists()
    assert (tmp_path / "out" / "a.report.md").exists()
    assert (tmp_path / "out" / "한글.md").exists()
    assert (tmp_path / "out" / "한글.metadata.json").exists()
    assert (tmp_path / "out" / "한글.report.md").exists()