"""Tests for the MinerU command-line adapter.

Covers availability probing, version detection, command construction,
strict-local option enforcement, output parsing after a conversion, and the
temporary environment variables applied while the runner executes.
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

import pytest

from pdf2md.ir import WarningCode
from pdf2md.gpu import GpuInfo
from pdf2md.mineru_adapter import (
    CommandResult,
    MinerUAdapter,
    MinerUOptions,
    StrictLocalViolationError,
    _run_command,
)
from pdf2md.mineru_profile import resolve_mineru_profile


class FakeRunner:
    """Scripted runner that replays queued results and records each command.

    Results are consumed in the order they were passed to the constructor.
    Calling the runner with nothing queued raises ``AssertionError``; the
    offending command is still recorded in ``commands`` first.
    """

    def __init__(self, *results: CommandResult) -> None:
        self.results = list(results)
        self.commands: list[tuple[str, ...]] = []

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        # Record before validating so an unexpected call remains inspectable.
        self.commands.append(command)
        if len(self.results) == 0:
            raise AssertionError("fake runner was called without a queued result")
        queued = self.results.pop(0)
        # Echo the real command back rather than the placeholder stored in the
        # queued result.
        return CommandResult(
            command=command,
            exit_code=queued.exit_code,
            stdout=queued.stdout,
            stderr=queued.stderr,
        )


class EnvironmentRunner:
    """Runner that snapshots selected environment variables when invoked.

    It also writes a minimal ``paper.md`` into the directory that follows the
    ``-o`` flag in the command, then reports exit code 0.
    """

    # Attribute name on this object -> environment variable captured into it.
    _ENV_BY_ATTRIBUTE = {
        "mineru_device_mode": "MINERU_DEVICE_MODE",
        "cuda_visible_devices": "CUDA_VISIBLE_DEVICES",
        "processing_window_size": "MINERU_PROCESSING_WINDOW_SIZE",
        "max_concurrent_requests": "MINERU_API_MAX_CONCURRENT_REQUESTS",
        "pdf_render_threads": "MINERU_PDF_RENDER_THREADS",
    }

    def __init__(self) -> None:
        self.mineru_device_mode: str | None = None
        self.cuda_visible_devices: str | None = None
        self.processing_window_size: str | None = None
        self.max_concurrent_requests: str | None = None
        self.pdf_render_threads: str | None = None

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        for attribute, variable in self._ENV_BY_ATTRIBUTE.items():
            setattr(self, attribute, os.environ.get(variable))

        # The output directory is the argument immediately after "-o".
        output_flag_at = command.index("-o")
        target_dir = Path(command[output_flag_at + 1])
        target_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = target_dir / "paper.md"
        markdown_file.write_text("# Title\n", encoding="utf-8")
        return CommandResult(command=command, exit_code=0)


def available(_: str) -> str:
    """``which`` stand-in that always resolves to a fixed executable path."""
    return "C:/local/bin/mineru.exe"


def missing(_: str) -> None:
    """``which`` stand-in that never finds the executable."""
    return None


def _warning_codes(outcome) -> list[WarningCode]:
    """Return the ``code`` of every warning attached to *outcome*, in order."""
    return [warning.code for warning in outcome.warnings]


def test_availability_check_uses_mockable_which() -> None:
    """is_available() reflects whatever the injected ``which`` returns."""
    found = MinerUAdapter(which=available, runner=FakeRunner())
    not_found = MinerUAdapter(which=missing, runner=FakeRunner())

    assert found.is_available() is True
    assert not_found.is_available() is False


@pytest.mark.parametrize("executable", ["mineru-api", "python", "C:/tools/mineru.exe"])
def test_custom_executable_is_rejected(executable: str) -> None:
    """Constructing the adapter with any of these executables raises."""
    with pytest.raises(StrictLocalViolationError):
        MinerUAdapter(executable=executable, which=available, runner=FakeRunner())


def test_missing_mineru_does_not_call_runner(tmp_path: Path) -> None:
    """convert() short-circuits with ENGINE_MISSING when ``which`` finds nothing."""
    fake = FakeRunner()
    mineru = MinerUAdapter(which=missing, runner=fake)

    outcome = mineru.convert(tmp_path / "paper.pdf", tmp_path / "work")

    assert outcome.succeeded is False
    assert outcome.exit_code is None
    assert fake.commands == []
    assert _warning_codes(outcome) == [WarningCode.ENGINE_MISSING]


def test_missing_mineru_version_does_not_call_runner() -> None:
    """version() short-circuits with ENGINE_MISSING when ``which`` finds nothing."""
    fake = FakeRunner()
    mineru = MinerUAdapter(which=missing, runner=fake)

    outcome = mineru.version()

    assert outcome.available is False
    assert outcome.exit_code is None
    assert fake.commands == []
    assert _warning_codes(outcome) == [WarningCode.ENGINE_MISSING]


def test_version_success_uses_stdout() -> None:
    """version() returns the stripped stdout text and runs ``mineru --version``."""
    expected_command = ("mineru", "--version")
    fake = FakeRunner(CommandResult((), 0, stdout="MinerU 3.1.0\n"))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.version()

    assert outcome.available is True
    assert outcome.version == "MinerU 3.1.0"
    assert outcome.command == expected_command
    assert fake.commands == [expected_command]


def test_version_success_can_use_stderr() -> None:
    """version() reports the version when the runner supplies it via stderr."""
    fake = FakeRunner(CommandResult((), 0, stderr="MinerU 3.1.0\n"))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.version()

    assert outcome.version == "MinerU 3.1.0"


def test_version_failure_is_explicit() -> None:
    """A non-zero exit from the version probe yields MINERU_CLI_FAILED."""
    fake = FakeRunner(CommandResult((), 2, stdout="", stderr="bad version"))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.version()

    assert outcome.version is None
    assert outcome.exit_code == 2
    assert _warning_codes(outcome) == [WarningCode.MINERU_CLI_FAILED]


def test_version_empty_output_is_explicit() -> None:
    """Exit code 0 with no output at all still yields MINERU_CLI_FAILED."""
    fake = FakeRunner(CommandResult((), 0, stdout="", stderr=""))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.version()

    assert outcome.available is True
    assert outcome.version is None
    assert outcome.exit_code == 0
    assert _warning_codes(outcome) == [WarningCode.MINERU_CLI_FAILED]


def test_default_runner_decodes_utf8_process_output() -> None:
    """_run_command returns child stdout/stderr decoded from UTF-8 bytes."""
    # The child writes raw UTF-8 bytes so the result does not depend on the
    # child's own text-mode stream encoding.
    code = (
        "import sys; "
        "sys.stdout.buffer.write('stdout ∙\\n'.encode('utf-8')); "
        "sys.stderr.buffer.write('stderr ∙\\n'.encode('utf-8'))"
    )

    outcome = _run_command((sys.executable, "-c", code))

    assert outcome.exit_code == 0
    assert outcome.stdout == "stdout ∙\n"
    assert outcome.stderr == "stderr ∙\n"


def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
    """build_command() yields a fixed argument tuple, paths passed verbatim."""
    mineru = MinerUAdapter(which=available, runner=FakeRunner())
    source_pdf = tmp_path / "논문 with spaces.pdf"
    target_dir = tmp_path / "work output"

    built = mineru.build_command(source_pdf, target_dir)

    assert built == ("mineru", "-p", str(source_pdf), "-o", str(target_dir))
    assert "--api-url" not in built


@pytest.mark.parametrize(
    "options",
    [
        MinerUOptions(extra_cli_args=("--api-url", "http://example.test")),
        MinerUOptions(engine_options={"api_url": "http://example.test"}),
        MinerUOptions(engine_options={"base_url": "http://example.test"}),
        MinerUOptions(engine_options={"mode": "router"}),
        MinerUOptions(engine_options={"backend": "http"}),
        MinerUOptions(engine_options={"openai_base_url": "http://example.test/v1"}),
        MinerUOptions(engine_options={"endpoint": "https://example.test"}),
        MinerUOptions(engine_options={"nested": {"url": "local http://example.test"}}),
        MinerUOptions(engine_options={"process": "mineru-api"}),
        MinerUOptions(gpu_device="https://example.test/gpu"),
        MinerUOptions(strict_local=False),
    ],
)
def test_strict_local_rejects_remote_router_and_backend_options(
    tmp_path: Path,
    options: MinerUOptions,
) -> None:
    """build_command() raises StrictLocalViolationError for each option set."""
    mineru = MinerUAdapter(which=available, runner=FakeRunner())

    with pytest.raises(StrictLocalViolationError):
        mineru.build_command(tmp_path / "paper.pdf", tmp_path / "work", options)


def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path) -> None:
    """A zero exit parses markdown, JSON and a sorted list of asset files."""
    source_pdf = tmp_path / "paper.pdf"
    target_dir = tmp_path / "work"
    assets_dir = target_dir / "assets"

    # Pre-populate the work directory; the fake runner writes nothing itself.
    (target_dir / "nested").mkdir(parents=True)
    (target_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
    (target_dir / "structured.json").write_text('{"pages": 1}', encoding="utf-8")
    assets_dir.mkdir()
    # Assets are created out of alphabetical order on purpose; the expected
    # listing below is sorted.
    (assets_dir / "z.png").write_bytes(b"z")
    (assets_dir / "a.png").write_bytes(b"a")
    (assets_dir / "nested").mkdir()
    (assets_dir / "nested" / "b.png").write_bytes(b"b")
    # Extra files that the assertions below expect to be left out of the
    # asset list.
    (target_dir / "zz_extra.md").write_text("not an asset", encoding="utf-8")
    (target_dir / "zz_extra.json").write_text("{}", encoding="utf-8")
    (target_dir / "run.log").write_text("diagnostic", encoding="utf-8")

    fake = FakeRunner(CommandResult((), 0, stdout="ok", stderr="warn"))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.convert(
        source_pdf,
        target_dir,
        MinerUOptions(engine_version="3.1.0", gpu_device="cuda:0"),
    )

    assert outcome.succeeded is True
    assert outcome.command == ("mineru", "-p", str(source_pdf), "-o", str(target_dir))
    assert outcome.raw_markdown == "# Title\n"
    assert outcome.raw_structured == {"pages": 1}

    relative_assets = [
        asset.relative_to(target_dir).as_posix() for asset in outcome.asset_paths
    ]
    assert relative_assets == [
        "assets/a.png",
        "assets/nested/b.png",
        "assets/z.png",
    ]

    assert outcome.engine == "MinerU"
    assert outcome.engine_version == "3.1.0"
    assert outcome.engine_options == {
        "strict_local": True,
        "gpu_device": "cuda:0",
        "mineru_profile": {
            "requested": "auto",
            "applied": "auto",
            "environment": {},
        },
    }
    assert outcome.exit_code == 0
    assert outcome.stdout == "ok"
    assert outcome.stderr == "warn"


def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None:
    """GPU selection is visible to the runner and prior env values come back."""
    monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
    monkeypatch.setenv("MINERU_PROCESSING_WINDOW_SIZE", "99")
    env_probe = EnvironmentRunner()
    mineru = MinerUAdapter(which=available, runner=env_probe)

    outcome = mineru.convert(
        tmp_path / "paper.pdf",
        tmp_path / "work",
        MinerUOptions(gpu_device="cuda:0"),
    )

    assert outcome.succeeded is True
    # Values observed while the runner was executing.
    assert env_probe.mineru_device_mode == "cuda"
    assert env_probe.cuda_visible_devices == "0"
    # Values present in the process environment after convert() returned.
    assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
    assert os.environ["MINERU_PROCESSING_WINDOW_SIZE"] == "99"


def test_profile_option_sets_allowlisted_mineru_environment_and_engine_options(tmp_path: Path) -> None:
    """A resolved performance profile reaches the runner's environment."""
    selected_gpu = GpuInfo(
        index=1,
        name="NVIDIA RTX 4090",
        memory_total_mib=24564,
        driver_version="577.00",
    )
    resolved = resolve_mineru_profile(
        "performance",
        selected_gpu=selected_gpu,
        cuda_requested=True,
    )
    env_probe = EnvironmentRunner()
    mineru = MinerUAdapter(which=available, runner=env_probe)
    options = MinerUOptions(
        gpu_device="cuda:1",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
    )

    outcome = mineru.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    assert outcome.succeeded is True
    assert env_probe.mineru_device_mode == "cuda"
    assert env_probe.cuda_visible_devices == "1"
    assert env_probe.processing_window_size == "16"
    assert env_probe.max_concurrent_requests == "1"
    assert env_probe.pdf_render_threads == "4"
    assert outcome.engine_options["mineru_profile"]["applied"] == "performance"


def test_profile_warnings_are_preserved_in_adapter_result(tmp_path: Path) -> None:
    """Warnings carried by the resolved profile surface on the convert result."""
    selected_gpu = GpuInfo(
        index=0,
        name="NVIDIA GeForce GTX 1070 Ti",
        memory_total_mib=8192,
        driver_version="577.00",
    )
    resolved = resolve_mineru_profile(
        "performance",
        selected_gpu=selected_gpu,
        cuda_requested=True,
    )
    mineru = MinerUAdapter(which=available, runner=EnvironmentRunner())
    options = MinerUOptions(
        gpu_device="cuda:0",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
        profile_warnings=resolved.warnings,
    )

    outcome = mineru.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    assert _warning_codes(outcome) == [WarningCode.MINERU_PROFILE_ADJUSTED]


def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
    """A failing exit code ignores files already present in the work dir."""
    target_dir = tmp_path / "work"
    target_dir.mkdir()
    (target_dir / "paper.md").write_text("existing output", encoding="utf-8")
    fake = FakeRunner(CommandResult((), 3, stdout="out", stderr="failed"))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.convert(tmp_path / "paper.pdf", target_dir)

    assert outcome.succeeded is False
    assert outcome.raw_markdown is None
    assert outcome.asset_paths == ()
    assert _warning_codes(outcome) == [WarningCode.MINERU_CLI_FAILED]


def test_exit_zero_with_no_usable_output_warns(tmp_path: Path) -> None:
    """Exit code 0 with an empty work dir is reported as a failure."""
    target_dir = tmp_path / "work"
    target_dir.mkdir()
    fake = FakeRunner(CommandResult((), 0))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.convert(tmp_path / "paper.pdf", target_dir)

    assert outcome.succeeded is False
    assert _warning_codes(outcome) == [WarningCode.MINERU_CLI_FAILED]
    assert "no usable" in outcome.warnings[0].message


def test_invalid_json_is_preserved_as_text_with_warning(tmp_path: Path) -> None:
    """Unparseable structured output is kept as raw text and flagged."""
    broken_json = "{not json"
    target_dir = tmp_path / "work"
    target_dir.mkdir()
    (target_dir / "paper.md").write_text("markdown", encoding="utf-8")
    (target_dir / "structured.json").write_text(broken_json, encoding="utf-8")
    fake = FakeRunner(CommandResult((), 0))
    mineru = MinerUAdapter(which=available, runner=fake)

    outcome = mineru.convert(tmp_path / "paper.pdf", target_dir)

    assert outcome.succeeded is True
    assert outcome.raw_structured == broken_json
    assert _warning_codes(outcome) == [WarningCode.MINERU_CLI_FAILED]