add pdftomd
This commit is contained in:
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.doctor import DoctorCommandResult, DoctorReport, format_doctor_report, run_doctor
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.math_render import default_mathjax_helper_path
|
||||
from pdf2md.mineru_adapter import MinerUVersionResult
|
||||
|
||||
|
||||
class FakeMinerUProbe:
    """Stand-in MinerU probe that reports a pre-built version result.

    The instance stores the result it is constructed with and hands that
    same object back from every ``version()`` call.
    """

    def __init__(self, result: MinerUVersionResult) -> None:
        # Canned result, returned verbatim by ``version``.
        self.result = result

    def version(self) -> MinerUVersionResult:
        """Return the stored result unchanged."""
        canned = self.result
        return canned
|
||||
|
||||
|
||||
class FakeCuda:
    """Fake CUDA object; ``FakeTorch`` exposes an instance as its ``cuda`` attribute.

    Device names and compute capabilities are held as two tuples that are
    indexed independently by device index.
    """

    def __init__(
        self,
        *,
        available: bool = True,
        devices: tuple[str, ...] = ("NVIDIA RTX 4060",),
        capabilities: tuple[tuple[int, int], ...] = ((8, 9),),
    ) -> None:
        self._available, self._devices, self._capabilities = (
            available,
            devices,
            capabilities,
        )

    def is_available(self) -> bool:
        """Return the ``available`` flag given at construction."""
        return self._available

    def device_count(self) -> int:
        """Return how many device names were configured."""
        names = self._devices
        return len(names)

    def get_device_name(self, index: int) -> str:
        """Return the configured device name at ``index``."""
        names = self._devices
        return names[index]

    def get_device_capability(self, index: int) -> tuple[int, int]:
        """Return the configured ``(major, minor)`` capability pair at ``index``."""
        pairs = self._capabilities
        return pairs[index]
|
||||
|
||||
|
||||
class FakeTorchVersion:
    """Fake object used as ``FakeTorch.version``; exposes a fixed ``cuda`` string."""

    cuda: str = "12.8"
|
||||
|
||||
|
||||
class FakeTorch:
    """Fake ``torch`` object returned by the fake import hooks in this module.

    Carries a fixed ``__version__`` string, a shared ``FakeTorchVersion``
    instance as ``version``, and the ``FakeCuda`` supplied at construction
    as ``cuda``.
    """

    version = FakeTorchVersion()
    __version__ = "2.8.0+cu128"

    def __init__(self, cuda: FakeCuda) -> None:
        # Per-instance CUDA fake so each test can choose its own devices.
        self.cuda = cuda
|
||||
|
||||
|
||||
def test_doctor_all_checks_pass_with_mocked_tools(tmp_path: Path) -> None:
    """With every dependency faked as healthy the report passes with exit code 0.

    Also pins the names and order of the checks in the report.
    """
    hf_home = tmp_path / "hf"
    report = make_report(
        tmp_path,
        env={"HF_HOME": str(hf_home)},
        existing_paths={hf_home},
    )

    expected_names = [
        "python",
        "uv",
        "mineru",
        "gpu",
        "pytorch",
        "models",
        "mathjax",
        "local-only",
    ]

    assert report.status == "pass"
    assert report.exit_code == 0
    assert [check.name for check in report.checks] == expected_names
|
||||
|
||||
|
||||
def test_doctor_fails_outside_python_312(tmp_path: Path) -> None:
    """A Python 3.11.9 version tuple fails both the python check and the report."""
    unsupported_version = (3, 11, 9)
    report = make_report(tmp_path, python_version=unsupported_version)

    check = find_check(report, "python")

    assert report.status == "fail"
    assert check.status == "fail"
    assert "use Python 3.12.x" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_uv_is_missing(tmp_path: Path) -> None:
    """When only ``nvidia-smi`` is resolvable, the uv check and the report fail."""
    # Tool table deliberately omits "uv".
    tools_without_uv = {"nvidia-smi": "C:/Windows/System32/nvidia-smi.exe"}
    report = make_report(tmp_path, available_tools=tools_without_uv)

    check = find_check(report, "uv")

    assert report.status == "fail"
    assert check.status == "fail"
    assert "uv executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_mineru_is_missing(tmp_path: Path) -> None:
    """An unavailable MinerU probe result fails the mineru check; exit code is 1."""
    unavailable = MinerUVersionResult(
        available=False,
        version=None,
        command=("mineru", "--version"),
        exit_code=None,
        stdout="",
        stderr="",
    )
    report = make_report(tmp_path, mineru_result=unavailable)

    check = find_check(report, "mineru")

    assert report.status == "fail"
    assert report.exit_code == 1
    assert check.status == "fail"
    assert "MinerU CLI executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_command_fails(tmp_path: Path) -> None:
    """MinerU is available but its version command exits 2: the check only warns."""
    cli_failure = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU version command failed.",
    )
    failed_probe = MinerUVersionResult(
        available=True,
        version=None,
        command=("mineru", "--version"),
        exit_code=2,
        stdout="",
        stderr="boom",
        warnings=(cli_failure,),
    )
    report = make_report(tmp_path, mineru_result=failed_probe)

    check = find_check(report, "mineru")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "version could not be detected" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_is_not_target(tmp_path: Path) -> None:
    """A reported MinerU 3.1.8 yields a warning that mentions the 3.1.0 target."""
    reported = "mineru, version 3.1.8"
    off_target = MinerUVersionResult(
        available=True,
        version=reported,
        command=("mineru", "--version"),
        exit_code=0,
        stdout=reported,
        stderr="",
    )
    report = make_report(tmp_path, mineru_result=off_target)

    check = find_check(report, "mineru")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "project target is 3.1.0" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None:
    """With only ``uv`` resolvable and torch unimportable, gpu and pytorch warn."""
    only_uv = {"uv": "C:/Users/user/.local/bin/uv.exe"}
    report = make_report(tmp_path, available_tools=only_uv, import_module=missing_torch)

    assert report.status == "warn"
    gpu_check = find_check(report, "gpu")
    assert gpu_check.status == "warn"
    pytorch_check = find_check(report, "pytorch")
    assert pytorch_check.status == "warn"
|
||||
|
||||
|
||||
def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None:
    """A GTX 1070 Ti line in the fake GPU output triggers the Pascal-risk warning."""
    smi_output = "NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n"
    report = make_report(tmp_path, gpu_stdout=smi_output)

    check = find_check(report, "gpu")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    # At least one detail line must name the card.
    assert any("GTX 1070 Ti" in line for line in check.details)
|
||||
|
||||
|
||||
def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None:
    """A fake torch reporting compute capability 6.1 makes the pytorch check warn.

    The fake GPU command output still names an RTX 4060, so the warning asserted
    here is the one attached to the pytorch check.
    """

    def fake_pascal_torch(name: str) -> FakeTorch:
        assert name == "torch"
        pascal_cuda = FakeCuda(
            devices=("NVIDIA GeForce GTX 1070 Ti",),
            capabilities=((6, 1),),
        )
        return FakeTorch(pascal_cuda)

    report = make_report(
        tmp_path,
        gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n",
        import_module=fake_pascal_torch,
    )

    check = find_check(report, "pytorch")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    assert any("compute capability 6.1" in line for line in check.details)
|
||||
|
||||
|
||||
def test_doctor_warns_when_model_cache_is_not_detected(tmp_path: Path) -> None:
    """An empty environment and no existing paths make the models check warn."""
    empty_env: dict[str, str] = {}
    no_paths: set[Path] = set()
    report = make_report(tmp_path, env=empty_env, existing_paths=no_paths)

    check = find_check(report, "models")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "No MinerU model/cache/config path" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_node_is_missing(tmp_path: Path) -> None:
    """Without a resolvable ``node`` executable the mathjax check warns."""
    # Same entries as the default tool table, minus "node".
    tools_without_node = {
        "uv": "C:/Users/user/.local/bin/uv.exe",
        "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
    }
    report = make_report(tmp_path, available_tools=tools_without_node)

    check = find_check(report, "mathjax")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "Node.js executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
    """A failing ``--health`` command makes the mathjax check warn.

    Every command not ending in ``--health`` is delegated to the regular fake
    runner, so only the health probe is forced to fail.
    """
    healthy_runner = command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")

    def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult:
        if command[-1] != "--health":
            return healthy_runner(command)
        return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'")

    report = make_report(tmp_path, run_command=failing_runner)

    check = find_check(report, "mathjax")

    assert report.status == "warn"
    assert check.status == "warn"
    assert "unavailable" in check.message
    assert any("mathjax" in line for line in check.details)
|
||||
|
||||
|
||||
def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
    """The formatted report starts with the overall status and tags each check."""
    report = make_report(
        tmp_path,
        gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n",
    )

    text = format_doctor_report(report)

    assert text.startswith("Doctor status: WARN\n")
    for expected_fragment in ("[WARN] gpu:", "[PASS] local-only:"):
        assert expected_fragment in text
|
||||
|
||||
|
||||
def make_report(
    tmp_path: Path,
    *,
    python_version: tuple[int, int, int] = (3, 12, 7),
    available_tools: dict[str, str] | None = None,
    mineru_result: MinerUVersionResult | None = None,
    gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n",
    env: dict[str, str] | None = None,
    existing_paths: set[Path] | None = None,
    import_module=None,
    run_command=None,
) -> DoctorReport:
    """Run ``run_doctor`` with every external dependency replaced by a fake.

    Each optional argument falls back to its default only when it is ``None``,
    so an explicitly empty value (for example ``available_tools={}``) is
    honoured rather than silently replaced.

    Args:
        tmp_path: Passed to ``run_doctor`` as ``home``; ``tmp_path / "hf"`` is
            also used for the default ``HF_HOME`` value and default existing path.
        python_version: Forwarded to ``run_doctor`` unchanged.
        available_tools: Mapping from executable name to resolved path, used to
            answer ``which`` lookups. Defaults to entries for ``uv``,
            ``nvidia-smi`` and ``node``.
        mineru_result: Result returned by the fake MinerU probe. Defaults to an
            available probe reporting ``"mineru, version 3.1.0"``.
        gpu_stdout: Stdout the default fake command runner returns for
            ``nvidia-smi``; unused when ``run_command`` is supplied.
        env: Environment mapping forwarded to ``run_doctor``. Defaults to a
            mapping that sets only ``HF_HOME``.
        existing_paths: Paths for which ``path_exists`` answers ``True``. The
            default MathJax helper path is always added to a copy of this set.
        import_module: Import hook forwarded to ``run_doctor``. Defaults to
            ``fake_torch``.
        run_command: Command runner forwarded to ``run_doctor``. Defaults to
            ``command_runner(gpu_stdout)``.

    Returns:
        The ``DoctorReport`` produced by ``run_doctor``.
    """
    hf_home = tmp_path / "hf"

    if available_tools is None:
        available_tools = {
            "uv": "C:/Users/user/.local/bin/uv.exe",
            "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
            "node": "C:/Program Files/nodejs/node.exe",
        }
    if mineru_result is None:
        mineru_result = MinerUVersionResult(
            available=True,
            version="mineru, version 3.1.0",
            command=("mineru", "--version"),
            exit_code=0,
            stdout="mineru, version 3.1.0",
            stderr="",
        )
    if env is None:
        env = {"HF_HOME": str(hf_home)}
    if run_command is None:
        run_command = command_runner(gpu_stdout)
    if import_module is None:
        import_module = fake_torch

    # Copy so the caller's set is never mutated by the helper-path addition.
    paths = {hf_home} if existing_paths is None else set(existing_paths)
    paths.add(default_mathjax_helper_path())

    return run_doctor(
        python_version=python_version,
        which=lambda executable: available_tools.get(executable),
        run_command=run_command,
        import_module=import_module,
        env=env,
        path_exists=lambda path: path in paths,
        home=tmp_path,
        mineru_probe=FakeMinerUProbe(mineru_result),
    )
|
||||
|
||||
|
||||
def command_runner(gpu_stdout: str):
    """Build a fake command runner for the commands these tests expect.

    The returned callable answers, in this order:

    * exactly ``("uv", "--version")`` with exit code 0 and ``"uv 0.8.13\\n"``;
    * any command whose first element is ``"nvidia-smi"`` with exit code 0 and
      ``gpu_stdout``;
    * a two-element ``--version`` command whose executable ends with
      ``node.exe`` with exit code 0 and ``"v24.13.0\\n"``;
    * any command whose last element is ``"--health"`` with exit code 0 and
      ``'{"ok":true}\\n'``;
    * everything else with exit code 127 and stderr ``"not found"``.
    """

    def run(command: tuple[str, ...]) -> DoctorCommandResult:
        if command == ("uv", "--version"):
            return DoctorCommandResult(command, 0, stdout="uv 0.8.13\n")

        # An empty command has no first or last element to inspect.
        executable = command[0] if command else None
        final_arg = command[-1] if command else None

        if executable == "nvidia-smi":
            return DoctorCommandResult(command, 0, stdout=gpu_stdout)

        is_node_version = (
            len(command) == 2
            and final_arg == "--version"
            and executable.endswith("node.exe")
        )
        if is_node_version:
            return DoctorCommandResult(command, 0, stdout="v24.13.0\n")

        if final_arg == "--health":
            return DoctorCommandResult(command, 0, stdout='{"ok":true}\n')

        return DoctorCommandResult(command, 127, stderr="not found")

    return run
|
||||
|
||||
|
||||
def fake_torch(name: str) -> FakeTorch:
    """Import-hook fake: return a ``FakeTorch`` wrapping a default ``FakeCuda``.

    Asserts that the only module requested is ``"torch"``.
    """
    assert name == "torch"
    default_cuda = FakeCuda()
    return FakeTorch(default_cuda)
|
||||
|
||||
|
||||
def missing_torch(name: str):
    """Import-hook fake that always fails, raising ``ImportError(name)``.

    Asserts that the only module requested is ``"torch"``.
    """
    assert name == "torch"
    error = ImportError(name)
    raise error
|
||||
|
||||
|
||||
def find_check(report: DoctorReport, name: str):
    """Return the first check in ``report.checks`` whose ``name`` equals ``name``.

    Args:
        report: Report whose ``checks`` are searched in order.
        name: Check name to look for.

    Returns:
        The first matching check object.

    Raises:
        AssertionError: If no check has that name. The message lists the names
            that are present, so a renamed or missing check is easy to spot
            (a bare ``next()`` would surface as an empty ``StopIteration``).
    """
    checks = list(report.checks)
    for check in checks:
        if check.name == name:
            return check
    available = [check.name for check in checks]
    raise AssertionError(
        f"doctor report has no check named {name!r}; available checks: {available}"
    )
|
||||
Reference in New Issue
Block a user