add pdftomd
This commit is contained in:
@@ -0,0 +1,264 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.mineru_adapter import (
|
||||
CommandResult,
|
||||
MinerUAdapter,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
)
|
||||
|
||||
|
||||
class FakeRunner:
    """Scripted stand-in for the adapter's command runner.

    Results passed to the constructor are replayed in first-in, first-out
    order, one per call. Every command received is recorded in ``commands``
    so tests can assert on exactly what was (or was not) executed.
    """

    def __init__(self, *results: CommandResult) -> None:
        # Commands seen so far, in call order.
        self.commands: list[tuple[str, ...]] = []
        # Remaining scripted results; consumed from the front.
        self.results = list(results)

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Record ``command`` and return the next queued result.

        The returned ``CommandResult`` carries the command that was actually
        passed in, combined with the exit code, stdout and stderr of the
        queued result.

        Raises:
            AssertionError: if no queued result is left.
        """
        # The command is recorded before the queue check, so an unexpected
        # call is still visible in ``commands``.
        self.commands.append(command)
        if self.results:
            queued = self.results.pop(0)
            return CommandResult(
                command=command,
                exit_code=queued.exit_code,
                stdout=queued.stdout,
                stderr=queued.stderr,
            )
        raise AssertionError("fake runner was called without a queued result")
|
||||
|
||||
|
||||
class EnvironmentRunner:
    """Runner double that snapshots device-related environment variables.

    When called, it stores the current values of ``MINERU_DEVICE_MODE`` and
    ``CUDA_VISIBLE_DEVICES``, writes a minimal ``paper.md`` into the output
    directory named after the ``-o`` flag, and reports exit code 0.
    """

    def __init__(self) -> None:
        # Both stay ``None`` until the runner has been invoked.
        self.cuda_visible_devices: str | None = None
        self.mineru_device_mode: str | None = None

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Capture the environment, create the output file, and return success."""
        environment = os.environ
        self.mineru_device_mode = environment.get("MINERU_DEVICE_MODE")
        self.cuda_visible_devices = environment.get("CUDA_VISIBLE_DEVICES")

        # The output directory is the argument that follows "-o".
        output_flag_position = command.index("-o")
        work_dir = Path(command[output_flag_position + 1])
        work_dir.mkdir(parents=True, exist_ok=True)

        markdown_file = work_dir / "paper.md"
        markdown_file.write_text("# Title\n", encoding="utf-8")
        return CommandResult(command=command, exit_code=0)
|
||||
|
||||
|
||||
def available(_: str) -> str:
    """Stand-in for a ``which`` lookup that always returns a fixed executable path."""
    resolved_path = "C:/local/bin/mineru.exe"
    return resolved_path
|
||||
|
||||
|
||||
def missing(_: str) -> None:
    """Stand-in for a ``which`` lookup that never finds anything (always ``None``)."""
    return
|
||||
|
||||
|
||||
def test_availability_check_uses_mockable_which() -> None:
    """``is_available`` mirrors what the injected ``which`` callable reports."""
    found_adapter = MinerUAdapter(which=available, runner=FakeRunner())
    assert found_adapter.is_available() is True

    not_found_adapter = MinerUAdapter(which=missing, runner=FakeRunner())
    assert not_found_adapter.is_available() is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executable",
    [
        "mineru-api",
        "python",
        "C:/tools/mineru.exe",
    ],
)
def test_custom_executable_is_rejected(executable: str) -> None:
    """Each listed executable makes the constructor raise StrictLocalViolationError."""
    constructor_kwargs = {
        "executable": executable,
        "which": available,
        "runner": FakeRunner(),
    }
    with pytest.raises(StrictLocalViolationError):
        MinerUAdapter(**constructor_kwargs)
|
||||
|
||||
|
||||
def test_missing_mineru_does_not_call_runner(tmp_path: Path) -> None:
    """With no executable found, ``convert`` fails with ENGINE_MISSING and runs nothing."""
    spy = FakeRunner()
    source_pdf = tmp_path / "paper.pdf"
    output_dir = tmp_path / "work"

    outcome = MinerUAdapter(which=missing, runner=spy).convert(source_pdf, output_dir)

    assert outcome.succeeded is False
    assert outcome.exit_code is None
    # The runner must never have been invoked.
    assert spy.commands == []
    warning_codes = [warning.code for warning in outcome.warnings]
    assert warning_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_missing_mineru_version_does_not_call_runner() -> None:
    """With no executable found, ``version`` reports ENGINE_MISSING and runs nothing."""
    spy = FakeRunner()

    probe = MinerUAdapter(which=missing, runner=spy).version()

    assert probe.available is False
    assert probe.exit_code is None
    # The runner must never have been invoked.
    assert spy.commands == []
    warning_codes = [warning.code for warning in probe.warnings]
    assert warning_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_version_success_uses_stdout() -> None:
    """A zero-exit version call yields the stdout text without its trailing newline."""
    expected_command = ("mineru", "--version")
    scripted = CommandResult((), 0, stdout="MinerU 3.1.0\n")
    spy = FakeRunner(scripted)

    probe = MinerUAdapter(which=available, runner=spy).version()

    assert probe.available is True
    assert probe.version == "MinerU 3.1.0"
    assert probe.command == expected_command
    # Exactly one invocation, and it is the version command.
    assert spy.commands == [expected_command]
|
||||
|
||||
|
||||
def test_version_success_can_use_stderr() -> None:
    """The version string is still reported when it arrives only on stderr."""
    scripted = CommandResult((), 0, stderr="MinerU 3.1.0\n")
    spy = FakeRunner(scripted)

    probe = MinerUAdapter(which=available, runner=spy).version()

    assert probe.version == "MinerU 3.1.0"
|
||||
|
||||
|
||||
def test_version_failure_is_explicit() -> None:
    """A non-zero exit gives no version, keeps the exit code, and warns MINERU_CLI_FAILED."""
    scripted = CommandResult((), 2, stdout="", stderr="bad version")
    spy = FakeRunner(scripted)

    probe = MinerUAdapter(which=available, runner=spy).version()

    assert probe.version is None
    assert probe.exit_code == 2
    warning_codes = [warning.code for warning in probe.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_version_empty_output_is_explicit() -> None:
    """Exit code 0 with empty stdout and stderr gives no version and one warning."""
    scripted = CommandResult((), 0, stdout="", stderr="")
    spy = FakeRunner(scripted)

    probe = MinerUAdapter(which=available, runner=spy).version()

    assert probe.available is True
    assert probe.version is None
    assert probe.exit_code == 0
    warning_codes = [warning.code for warning in probe.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
    """``build_command`` returns one tuple element per argument, paths passed verbatim."""
    # Non-ASCII characters and spaces show that each path stays a single
    # argument rather than being split or quoted.
    input_pdf = tmp_path / "논문 with spaces.pdf"
    work_dir = tmp_path / "work output"
    expected = ("mineru", "-p", str(input_pdf), "-o", str(work_dir))

    built = MinerUAdapter(which=available, runner=FakeRunner()).build_command(input_pdf, work_dir)

    assert built == expected
    assert "--api-url" not in built
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        MinerUOptions(extra_cli_args=("--api-url", "http://example.test")),
        MinerUOptions(engine_options={"api_url": "http://example.test"}),
        MinerUOptions(engine_options={"base_url": "http://example.test"}),
        MinerUOptions(engine_options={"mode": "router"}),
        MinerUOptions(engine_options={"backend": "http"}),
        MinerUOptions(engine_options={"openai_base_url": "http://example.test/v1"}),
        MinerUOptions(engine_options={"endpoint": "https://example.test"}),
        MinerUOptions(engine_options={"nested": {"url": "local http://example.test"}}),
        MinerUOptions(engine_options={"process": "mineru-api"}),
        MinerUOptions(gpu_device="https://example.test/gpu"),
        MinerUOptions(strict_local=False),
    ],
)
def test_strict_local_rejects_remote_router_and_backend_options(
    tmp_path: Path,
    options: MinerUOptions,
) -> None:
    """Every parametrized option set makes ``build_command`` raise StrictLocalViolationError."""
    adapter = MinerUAdapter(which=available, runner=FakeRunner())
    source_pdf = tmp_path / "paper.pdf"
    output_dir = tmp_path / "work"

    with pytest.raises(StrictLocalViolationError):
        adapter.build_command(source_pdf, output_dir, options)
|
||||
|
||||
|
||||
def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path) -> None:
    """A successful run exposes the Markdown, the parsed JSON, and the asset files.

    The output tree is created up front and the runner is mocked. The asset
    assertion expects only the files under ``assets/``, in sorted order.
    """
    source_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"
    assets_dir = work_dir / "assets"

    # Fake output tree. ``z.png`` is written before ``a.png`` so the expected
    # ordering below does not simply mirror creation order.
    (work_dir / "nested").mkdir(parents=True)
    (work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
    (work_dir / "structured.json").write_text('{"pages": 1}', encoding="utf-8")
    assets_dir.mkdir()
    (assets_dir / "z.png").write_bytes(b"z")
    (assets_dir / "a.png").write_bytes(b"a")
    (assets_dir / "nested").mkdir()
    (assets_dir / "nested" / "b.png").write_bytes(b"b")
    # Extra files in the work directory; none of them appear in the expected
    # asset list below.
    (work_dir / "zz_extra.md").write_text("not an asset", encoding="utf-8")
    (work_dir / "zz_extra.json").write_text("{}", encoding="utf-8")
    (work_dir / "run.log").write_text("diagnostic", encoding="utf-8")

    spy = FakeRunner(CommandResult((), 0, stdout="ok", stderr="warn"))
    adapter = MinerUAdapter(which=available, runner=spy)
    options = MinerUOptions(engine_version="3.1.0", gpu_device="cuda:0")

    outcome = adapter.convert(source_pdf, work_dir, options)

    assert outcome.succeeded is True
    assert outcome.command == ("mineru", "-p", str(source_pdf), "-o", str(work_dir))
    assert outcome.raw_markdown == "# Title\n"
    assert outcome.raw_structured == {"pages": 1}
    relative_assets = [
        asset.relative_to(work_dir).as_posix() for asset in outcome.asset_paths
    ]
    assert relative_assets == [
        "assets/a.png",
        "assets/nested/b.png",
        "assets/z.png",
    ]
    assert outcome.engine == "MinerU"
    assert outcome.engine_version == "3.1.0"
    assert outcome.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
    assert outcome.exit_code == 0
    assert outcome.stdout == "ok"
    assert outcome.stderr == "warn"
|
||||
|
||||
|
||||
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``gpu_device`` is applied through the environment only for the duration of the run.

    The environment starts with ``MINERU_DEVICE_MODE=cpu`` and
    ``CUDA_VISIBLE_DEVICES=7``. While the runner executes with
    ``gpu_device="cuda:0"`` it must observe ``cuda`` and ``0``; after
    ``convert`` returns, the original values must be back in place.
    """
    # Pre-existing values that the adapter is expected to restore afterwards.
    monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
    runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=runner)

    result = adapter.convert(
        tmp_path / "paper.pdf",
        tmp_path / "work",
        MinerUOptions(gpu_device="cuda:0"),
    )

    assert result.succeeded is True
    # Values captured by the runner while the command was "running".
    assert runner.mineru_device_mode == "cuda"
    assert runner.cuda_visible_devices == "0"
    # Values visible to the test process once convert() has returned.
    assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
|
||||
|
||||
|
||||
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
    """After a non-zero exit, output already present in the work directory is not reported."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    # Markdown that is already on disk before the failing run.
    existing_markdown = work_dir / "paper.md"
    existing_markdown.write_text("existing output", encoding="utf-8")
    spy = FakeRunner(CommandResult((), 3, stdout="out", stderr="failed"))

    outcome = MinerUAdapter(which=available, runner=spy).convert(tmp_path / "paper.pdf", work_dir)

    assert outcome.succeeded is False
    assert outcome.raw_markdown is None
    assert outcome.asset_paths == ()
    warning_codes = [warning.code for warning in outcome.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_exit_zero_with_no_usable_output_warns(tmp_path: Path) -> None:
    """Exit code 0 with an empty work directory is a failure with a "no usable" warning."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    spy = FakeRunner(CommandResult((), 0))

    outcome = MinerUAdapter(which=available, runner=spy).convert(tmp_path / "paper.pdf", work_dir)

    assert outcome.succeeded is False
    warning_codes = [warning.code for warning in outcome.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
    first_warning = outcome.warnings[0]
    assert "no usable" in first_warning.message
|
||||
|
||||
|
||||
def test_invalid_json_is_preserved_as_text_with_warning(tmp_path: Path) -> None:
    """Unparseable JSON is returned as raw text; the run still succeeds, with one warning."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    markdown_file = work_dir / "paper.md"
    markdown_file.write_text("markdown", encoding="utf-8")
    # Deliberately malformed JSON.
    structured_file = work_dir / "structured.json"
    structured_file.write_text("{not json", encoding="utf-8")
    spy = FakeRunner(CommandResult((), 0))

    outcome = MinerUAdapter(which=available, runner=spy).convert(tmp_path / "paper.pdf", work_dir)

    assert outcome.succeeded is True
    assert outcome.raw_structured == "{not json"
    warning_codes = [warning.code for warning in outcome.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
Reference in New Issue
Block a user