modify pdftomd
This commit is contained in:
@@ -1,17 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.mineru_adapter import (
|
||||
CommandResult,
|
||||
MinerUAdapter,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
_run_command,
|
||||
)
|
||||
from pdf2md.mineru_profile import resolve_mineru_profile
|
||||
|
||||
|
||||
class FakeRunner:
|
||||
@@ -36,10 +40,16 @@ class EnvironmentRunner:
|
||||
def __init__(self) -> None:
|
||||
self.mineru_device_mode: str | None = None
|
||||
self.cuda_visible_devices: str | None = None
|
||||
self.processing_window_size: str | None = None
|
||||
self.max_concurrent_requests: str | None = None
|
||||
self.pdf_render_threads: str | None = None
|
||||
|
||||
def __call__(self, command: tuple[str, ...]) -> CommandResult:
|
||||
self.mineru_device_mode = os.environ.get("MINERU_DEVICE_MODE")
|
||||
self.cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
|
||||
self.processing_window_size = os.environ.get("MINERU_PROCESSING_WINDOW_SIZE")
|
||||
self.max_concurrent_requests = os.environ.get("MINERU_API_MAX_CONCURRENT_REQUESTS")
|
||||
self.pdf_render_threads = os.environ.get("MINERU_PDF_RENDER_THREADS")
|
||||
work_dir = Path(command[command.index("-o") + 1])
|
||||
work_dir.mkdir(parents=True, exist_ok=True)
|
||||
(work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
|
||||
@@ -133,6 +143,20 @@ def test_version_empty_output_is_explicit() -> None:
|
||||
assert [warning.code for warning in result.warnings] == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_default_runner_decodes_utf8_process_output() -> None:
    """Check that ``_run_command`` returns UTF-8 child output as text.

    A child interpreter writes raw UTF-8 bytes containing a non-ASCII
    character to both stdout and stderr; the captured strings must compare
    equal to the same text.
    """
    # The child writes through the binary buffers so the bytes on the pipe
    # are exactly the UTF-8 encoding of each message.
    child_script = (
        "import sys; "
        "sys.stdout.buffer.write('stdout ∙\\n'.encode('utf-8')); "
        "sys.stderr.buffer.write('stderr ∙\\n'.encode('utf-8'))"
    )
    argv = (sys.executable, "-c", child_script)

    outcome = _run_command(argv)

    assert outcome.exit_code == 0
    assert outcome.stdout == "stdout ∙\n"
    assert outcome.stderr == "stderr ∙\n"
|
||||
|
||||
|
||||
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
|
||||
adapter = MinerUAdapter(which=available, runner=FakeRunner())
|
||||
input_pdf = tmp_path / "논문 with spaces.pdf"
|
||||
@@ -200,7 +224,15 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path
|
||||
]
|
||||
assert result.engine == "MinerU"
|
||||
assert result.engine_version == "3.1.0"
|
||||
assert result.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
assert result.engine_options == {
|
||||
"strict_local": True,
|
||||
"gpu_device": "cuda:0",
|
||||
"mineru_profile": {
|
||||
"requested": "auto",
|
||||
"applied": "auto",
|
||||
"environment": {},
|
||||
},
|
||||
}
|
||||
assert result.exit_code == 0
|
||||
assert result.stdout == "ok"
|
||||
assert result.stderr == "warn"
|
||||
@@ -209,6 +241,7 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path
|
||||
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None:
|
||||
monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
|
||||
monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
|
||||
monkeypatch.setenv("MINERU_PROCESSING_WINDOW_SIZE", "99")
|
||||
runner = EnvironmentRunner()
|
||||
adapter = MinerUAdapter(which=available, runner=runner)
|
||||
|
||||
@@ -219,6 +252,53 @@ def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_pat
|
||||
assert runner.cuda_visible_devices == "0"
|
||||
assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
|
||||
assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
|
||||
assert os.environ["MINERU_PROCESSING_WINDOW_SIZE"] == "99"
|
||||
|
||||
|
||||
def test_profile_option_sets_allowlisted_mineru_environment_and_engine_options(tmp_path: Path) -> None:
    """Check the environment and engine options for a ``performance`` profile.

    The profile is resolved for a 24564 MiB GPU at index 1 and passed to the
    adapter. ``EnvironmentRunner`` snapshots the MinerU-related environment
    variables when the command runs, and those snapshots are compared with
    the expected values.
    """
    selected_gpu = GpuInfo(
        index=1,
        name="NVIDIA RTX 4090",
        memory_total_mib=24564,
        driver_version="577.00",
    )
    resolved = resolve_mineru_profile("performance", selected_gpu=selected_gpu, cuda_requested=True)
    env_runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=env_runner)

    input_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"
    options = MinerUOptions(
        gpu_device="cuda:1",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
    )
    result = adapter.convert(input_pdf, work_dir, options)

    assert result.succeeded is True
    # Values the runner saw in os.environ during the command.
    assert env_runner.mineru_device_mode == "cuda"
    assert env_runner.cuda_visible_devices == "1"
    assert env_runner.processing_window_size == "16"
    assert env_runner.max_concurrent_requests == "1"
    assert env_runner.pdf_render_threads == "4"
    # The applied profile is reported back through the engine options.
    profile_report = result.engine_options["mineru_profile"]
    assert profile_report["applied"] == "performance"
|
||||
|
||||
|
||||
def test_profile_warnings_are_preserved_in_adapter_result(tmp_path: Path) -> None:
    """Check that profile warnings passed in the options reach the result.

    The ``performance`` profile is resolved for an 8192 MiB GPU and its
    warnings are passed through ``MinerUOptions``. The conversion result is
    expected to carry exactly one ``MINERU_PROFILE_ADJUSTED`` warning.
    """
    selected_gpu = GpuInfo(
        index=0,
        name="NVIDIA GeForce GTX 1070 Ti",
        memory_total_mib=8192,
        driver_version="577.00",
    )
    resolved = resolve_mineru_profile("performance", selected_gpu=selected_gpu, cuda_requested=True)
    adapter = MinerUAdapter(which=available, runner=EnvironmentRunner())

    input_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"
    options = MinerUOptions(
        gpu_device="cuda:0",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
        profile_warnings=resolved.warnings,
    )
    result = adapter.convert(input_pdf, work_dir, options)

    reported_codes = [item.code for item in result.warnings]
    assert reported_codes == [WarningCode.MINERU_PROFILE_ADJUSTED]
|
||||
|
||||
|
||||
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
|
||||
|
||||
Reference in New Issue
Block a user