from __future__ import annotations import subprocess import sys import hashlib import json from datetime import datetime, timezone from importlib.metadata import entry_points from pathlib import Path import pytest from pypdf import PdfWriter import pdf2md.conversion as conversion_module from pdf2md.cli import main from pdf2md.doctor import DoctorCheck, DoctorReport from pdf2md.gpu import GpuInfo from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity from pdf2md.mineru_adapter import MinerUAdapterResult class FakeAdapter: def __init__(self, *, succeeded: bool = True, raw_markdown: str | None = None) -> None: self.succeeded = succeeded self.raw_markdown = raw_markdown self.calls: list[Path] = [] self.options: list[object] = [] def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult: input_path = Path(input_pdf) output_dir = Path(work_dir) output_dir.mkdir(parents=True, exist_ok=True) self.calls.append(input_path) self.options.append(options) warning = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.") return MinerUAdapterResult( succeeded=self.succeeded, command=("mineru", "-p", str(input_path), "-o", str(output_dir)), input_pdf=input_path, work_dir=output_dir, raw_markdown=(self.raw_markdown or f"# {input_path.stem}\n") if self.succeeded else None, raw_structured={"pages": 1}, asset_paths=(), warnings=() if self.succeeded else (warning,), engine="MinerU", engine_version="3.1.0", engine_options=options.to_engine_options() if options is not None else {"strict_local": True}, exit_code=0 if self.succeeded else 2, stdout="", stderr="", ) def fixed_clock() -> datetime: return datetime(2026, 5, 8, tzinfo=timezone.utc) def make_pdf(directory: Path, name: str) -> Path: path = directory / name path.parent.mkdir(parents=True, exist_ok=True) path.write_bytes(b"%PDF-1.7\n") return path def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path: path = directory / name path.parent.mkdir(parents=True, exist_ok=True) writer = PdfWriter() for _ in range(page_count): writer.add_blank_page(width=72, height=72) with path.open("wb") as file: writer.write(file) return path def write_legacy_metadata(markdown_path: Path, source_pdf: Path) -> Path: metadata_path = markdown_path.with_suffix(".metadata.json") metadata = { "source_pdf": str(source_pdf.resolve()), "source_sha256": hashlib.sha256(source_pdf.read_bytes()).hexdigest(), "created_at": "2026-05-08T00:00:00Z", "engine": "MinerU", "engine_version": "3.1.0", "engine_options": {"strict_local": True}, "pages": [{"page_index": 0, "blocks": []}], "assets": [], "warnings": [], "summary": {"pages_processed": 1, "warning_count": 0}, } metadata_path.write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8") return metadata_path def test_console_script_entry_point_is_reserved() -> None: scripts = {entry_point.name: entry_point for entry_point in entry_points(group="console_scripts")} assert scripts["pdf2md"].value == "pdf2md.cli:main" def test_cli_no_args_prints_help(capsys) -> None: assert main([]) == 0 captured = capsys.readouterr() assert "usage: pdf2md" in captured.out assert "convert" in captured.out assert "--no-strict-local" not in captured.out def test_cli_version_module_execution() -> None: completed = subprocess.run( [sys.executable, "-m", "pdf2md.cli", "--version"], check=False, capture_output=True, text=True, ) assert completed.returncode == 0 assert completed.stdout.strip() == "pdf2md 0.1.0" def test_cli_doctor_success_returns_zero(capsys) -> None: exit_code = main(["doctor"], doctor_runner=lambda: DoctorReport((DoctorCheck("python", "pass", "ok"),))) captured = capsys.readouterr() assert exit_code == 0 assert "Doctor status: PASS" in captured.out assert "[PASS] python: ok" in captured.out def test_cli_doctor_warning_only_returns_zero(capsys) -> None: exit_code = main(["doctor"], doctor_runner=lambda: DoctorReport((DoctorCheck("gpu", "warn", "missing"),))) captured = capsys.readouterr() assert exit_code == 0 assert "Doctor status: WARN" in captured.out assert "[WARN] gpu: missing" in captured.out def test_cli_doctor_failure_returns_nonzero(capsys) -> None: exit_code = main(["doctor"], doctor_runner=lambda: DoctorReport((DoctorCheck("mineru", "fail", "missing"),))) captured = capsys.readouterr() assert exit_code == 1 assert "Doctor status: FAIL" in captured.out assert "[FAIL] mineru: missing" in captured.out def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") out = tmp_path / "out" adapter = FakeAdapter() exit_code = main(["convert", str(pdf), "--out", str(out), "--metadata"], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 0 assert "converted: 1" in captured.out assert "failed: 0" in captured.out assert "warnings: 0" in captured.out assert (out / "paper" / "paper_001.md").exists() assert not list(out.rglob("*.metadata.json")) assert (out / "paper" / "paper_report.md").exists() assert adapter.calls == [pdf.resolve()] assert adapter.options[0].to_engine_options() == { "strict_local": True, "gpu_device": "cuda:0", "mineru_profile": { "requested": "auto", "applied": "safe", "environment": { "MINERU_API_MAX_CONCURRENT_REQUESTS": "1", "MINERU_PDF_RENDER_THREADS": "1", "MINERU_PROCESSING_WINDOW_SIZE": "1", }, }, } def test_cli_convert_accepts_safe_mineru_profile(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") adapter = FakeAdapter() exit_code = main( ["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "safe"], adapter=adapter, clock=fixed_clock, ) capsys.readouterr() assert exit_code == 0 assert adapter.options[0].to_engine_options()["mineru_profile"]["requested"] == "safe" def test_cli_convert_accepts_performance_mineru_profile(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") adapter = FakeAdapter() exit_code = main( ["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "performance"], adapter=adapter, clock=fixed_clock, ) capsys.readouterr() assert exit_code == 0 profile = adapter.options[0].to_engine_options()["mineru_profile"] assert profile["requested"] == "performance" assert profile["applied"] == "safe" def test_cli_convert_rejects_invalid_mineru_profile(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") with pytest.raises(SystemExit) as error: main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "fast"]) captured = capsys.readouterr() assert error.value.code == 2 assert "invalid choice" in captured.err def test_cli_convert_gpu_auto_selects_largest_visible_gpu(tmp_path: Path, capsys, monkeypatch) -> None: pdf = make_pdf(tmp_path, "paper.pdf") adapter = FakeAdapter() inventory = ( GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"), GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"), ) monkeypatch.setattr(conversion_module, "query_nvidia_gpus", lambda: inventory) exit_code = main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--gpu", "auto"], adapter=adapter, clock=fixed_clock) capsys.readouterr() options = adapter.options[0].to_engine_options() assert exit_code == 0 assert options["gpu_device"] == "cuda:1" assert options["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090" def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None: source = tmp_path / "pdfs" make_pdf(source, "b.pdf") make_pdf(source, "a.pdf") adapter = FakeAdapter() exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 0 assert [path.name for path in adapter.calls] == ["a.pdf", "b.pdf"] assert "converted: 2" in captured.out assert captured.out.index("a.pdf") < captured.out.index("b.pdf") def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> None: source = tmp_path / "pdfs" make_pdf(source, "top.pdf") make_pdf(source / "nested", "child.pdf") adapter = FakeAdapter() exit_code = main( ["convert", str(source), "--out", str(tmp_path / "out"), "--recursive"], adapter=adapter, clock=fixed_clock, ) captured = capsys.readouterr() assert exit_code == 0 assert [path.name for path in adapter.calls] == ["child.pdf", "top.pdf"] assert "converted: 2" in captured.out assert (tmp_path / "out" / "nested" / "child" / "child_001.md").exists() def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") adapter = FakeAdapter(succeeded=False) exit_code = main(["convert", str(pdf), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 1 assert "failed: 1" in captured.out assert "warnings: 1" in captured.out assert not (tmp_path / "out" / "paper.md").exists() def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") out = tmp_path / "out" adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n") assert ( main( ["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock, math_checker=lambda _: False, ) == 0 ) capsys.readouterr() markdown_path = out / "paper" / "paper_001.md" markdown_path.write_text("Inline $x_i$\n", encoding="utf-8") write_legacy_metadata(markdown_path, pdf) exit_code = main(["recheck", str(markdown_path)], clock=fixed_clock, math_checker=lambda _: True) captured = capsys.readouterr() assert exit_code == 0 assert "rechecked:" in captured.out assert "warnings: 0" in captured.out assert "- Final status: `success`" in markdown_path.with_suffix(".report.md").read_text(encoding="utf-8") def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") out = tmp_path / "out" (out / "paper").mkdir(parents=True) (out / "paper" / "paper_001.md").write_text("old", encoding="utf-8") adapter = FakeAdapter() exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 2 assert "planned outputs already exist" in captured.err assert adapter.calls == [] def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, capsys) -> None: pdf = make_pdf_with_pages(tmp_path, "long.pdf", 21) out = tmp_path / "out" adapter = FakeAdapter() exit_code = main(["convert", str(pdf), "--out", str(out), "--chunk-pages"], adapter=adapter, clock=fixed_clock) captured = capsys.readouterr() assert exit_code == 0 assert "converted: 2" in captured.out assert len(adapter.calls) == 21 assert [path.name for path in adapter.calls[:3]] == ["long.page-001.pdf", "long.page-002.pdf", "long.page-003.pdf"] assert (out / "long" / "long_001.md").exists() assert (out / "long" / "long_002.md").exists() assert (out / "long" / "long_report.md").exists() assert not list(out.rglob("*.metadata.json")) def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None: pdf = make_pdf(tmp_path, "paper.pdf") with pytest.raises(SystemExit) as error: main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--chunk-pages", "0"]) captured = capsys.readouterr() assert error.value.code == 2 assert "must be a positive integer" in captured.err