# File viewer metadata: 233 lines, 7.9 KiB, Python
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from importlib.metadata import entry_points
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from pypdf import PdfWriter
|
|
|
|
from pdf2md.cli import main
|
|
from pdf2md.doctor import DoctorCheck, DoctorReport
|
|
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
|
from pdf2md.mineru_adapter import MinerUAdapterResult
|
|
|
|
|
|
class FakeAdapter:
    """Test double used in place of the real MinerU adapter.

    Each ``convert`` call is recorded (input path and options object, in call
    order) and answered with a canned ``MinerUAdapterResult`` whose outcome is
    decided by the ``succeeded`` flag passed at construction time.
    """

    def __init__(self, *, succeeded: bool = True) -> None:
        # Outcome every ``convert`` call will report.
        self.succeeded = succeeded
        # Call log: one entry per ``convert`` invocation.
        self.calls: list[Path] = []
        self.options: list[object] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call and return a canned adapter result.

        Creates ``work_dir`` if needed, appends the input path and options to
        the call log, and builds a result that either carries a one-line
        markdown heading (success, exit code 0) or a single MinerU-failure
        warning with no markdown (failure, exit code 2).
        """
        pdf_path = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        self.calls.append(pdf_path)
        self.options.append(options)

        failure_warning = WarningRecord(
            WarningCode.MINERU_CLI_FAILED,
            WarningSeverity.ERROR,
            "MinerU failed.",
        )

        if self.succeeded:
            markdown = f"# {pdf_path.stem}\n"
            reported_warnings = ()
            status = 0
        else:
            markdown = None
            reported_warnings = (failure_warning,)
            status = 2

        # Without an options object, fall back to the strict-local default.
        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(pdf_path), "-o", str(target_dir)),
            input_pdf=pdf_path,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=status,
            stdout="",
            stderr="",
        )
|
|
|
|
|
|
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: midnight UTC on 2026-05-08."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
|
|
|
|
|
def make_pdf(directory: Path, name: str) -> Path:
    """Write a stub PDF (header bytes only) at ``directory / name``.

    Missing parent directories are created. Returns the path of the new file.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\n")
    return target
|
|
|
|
|
|
def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages at ``directory / name``.

    Missing parent directories are created. Returns the path of the new file.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)

    document = PdfWriter()
    for _page in range(page_count):
        document.add_blank_page(width=72, height=72)

    with target.open("wb") as stream:
        document.write(stream)
    return target
|
|
|
|
|
|
def test_console_script_entry_point_is_reserved() -> None:
    """The ``pdf2md`` console script must point at ``pdf2md.cli:main``."""
    console_scripts = entry_points(group="console_scripts")
    by_name = {script.name: script for script in console_scripts}

    assert by_name["pdf2md"].value == "pdf2md.cli:main"
|
|
|
|
|
|
def test_cli_no_args_prints_help(capsys) -> None:
    """An empty argv exits 0 and prints usage that lists ``convert``.

    The help text must not advertise ``--no-strict-local``.
    """
    status = main([])
    assert status == 0

    stdout = capsys.readouterr().out
    assert "usage: pdf2md" in stdout
    assert "convert" in stdout
    assert "--no-strict-local" not in stdout
|
|
|
|
|
|
def test_cli_version_module_execution() -> None:
    """``python -m pdf2md.cli --version`` exits 0 and prints the version line."""
    command = [sys.executable, "-m", "pdf2md.cli", "--version"]

    # check=False: the return code is asserted explicitly below.
    result = subprocess.run(command, check=False, capture_output=True, text=True)

    assert result.returncode == 0
    assert result.stdout.strip() == "pdf2md 0.1.0"
|
|
|
|
|
|
def test_cli_doctor_success_returns_zero(capsys) -> None:
    """A doctor report with only a passing check exits 0 and prints PASS."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("python", "pass", "ok"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: PASS" in stdout
    assert "[PASS] python: ok" in stdout
|
|
|
|
|
|
def test_cli_doctor_warning_only_returns_zero(capsys) -> None:
    """A doctor report containing only a warning still exits 0 and prints WARN."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("gpu", "warn", "missing"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: WARN" in stdout
    assert "[WARN] gpu: missing" in stdout
|
|
|
|
|
|
def test_cli_doctor_failure_returns_nonzero(capsys) -> None:
    """A doctor report with a failing check exits 1 and prints FAIL."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("mineru", "fail", "missing"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    stdout = capsys.readouterr().out
    assert exit_code == 1
    assert "Doctor status: FAIL" in stdout
    assert "[FAIL] mineru: missing" in stdout
|
|
|
|
|
|
def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsys) -> None:
    """Converting one PDF exits 0, prints a clean summary and writes three outputs.

    Also checks that the adapter was called once with the resolved input path
    and the expected engine options.
    """
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    fake = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(out_dir)]

    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    for summary_line in ("converted: 1", "failed: 0", "warnings: 0"):
        assert summary_line in stdout
    for artifact in ("paper.md", "paper.metadata.json", "paper.report.md"):
        assert (out_dir / artifact).exists()
    assert fake.calls == [source_pdf.resolve()]
    assert fake.options[0].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
|
|
|
|
|
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
    """Converting a directory processes and reports its PDFs in name order."""
    source_dir = tmp_path / "pdfs"
    # Files are created in reverse name order; the assertions expect name order.
    for filename in ("b.pdf", "a.pdf"):
        make_pdf(source_dir, filename)
    fake = FakeAdapter()
    argv = ["convert", str(source_dir), "--out", str(tmp_path / "out")]

    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    processed = [call.name for call in fake.calls]
    assert processed == ["a.pdf", "b.pdf"]
    assert "converted: 2" in stdout
    assert stdout.index("a.pdf") < stdout.index("b.pdf")
|
|
|
|
|
|
def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> None:
    """With ``--recursive`` nested PDFs are converted and mirrored under the output dir."""
    source_dir = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    make_pdf(source_dir, "top.pdf")
    make_pdf(source_dir / "nested", "child.pdf")
    fake = FakeAdapter()
    argv = ["convert", str(source_dir), "--out", str(out_dir), "--recursive"]

    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    processed = [call.name for call in fake.calls]
    assert processed == ["child.pdf", "top.pdf"]
    assert "converted: 2" in stdout
    assert (out_dir / "nested" / "child.md").exists()
|
|
|
|
|
|
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
    """A failing adapter yields exit code 1, a failure summary and no markdown output."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    failing = FakeAdapter(succeeded=False)
    argv = ["convert", str(source_pdf), "--out", str(out_dir)]

    exit_code = main(argv, adapter=failing, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 1
    assert "failed: 1" in stdout
    assert "warnings: 1" in stdout
    assert not (out_dir / "paper.md").exists()
|
|
|
|
|
|
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
    """An existing planned output aborts with exit code 2 before the adapter is called."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    # Pre-existing output that collides with the planned markdown file.
    existing_markdown = out_dir / "paper.md"
    existing_markdown.write_text("old", encoding="utf-8")
    fake = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(out_dir)]

    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stderr = capsys.readouterr().err
    assert exit_code == 2
    assert "planned outputs already exist" in stderr
    assert fake.calls == []
|
|
|
|
|
|
def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, capsys) -> None:
    """A bare ``--chunk-pages`` splits a 21-page PDF into a 20-page and a 1-page part."""
    source_pdf = make_pdf_with_pages(tmp_path, "long.pdf", 21)
    out_dir = tmp_path / "out"
    fake = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(out_dir), "--chunk-pages"]

    exit_code = main(argv, adapter=fake, clock=fixed_clock)

    stdout = capsys.readouterr().out
    assert exit_code == 0
    assert "converted: 2" in stdout

    expected_chunk_pdfs = [
        "long.part-001.pages-001-020.pdf",
        "long.part-002.pages-021-021.pdf",
    ]
    assert [call.name for call in fake.calls] == expected_chunk_pdfs

    first_markdown = out_dir / "long.part-001.pages-001-020.md"
    second_markdown = out_dir / "long.part-002.pages-021-021.md"
    assert first_markdown.exists()
    assert second_markdown.exists()
|
|
|
|
|
|
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
    """``--chunk-pages 0`` exits with code 2 and an explanatory message on stderr."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--chunk-pages", "0"]

    with pytest.raises(SystemExit) as excinfo:
        main(argv)

    stderr = capsys.readouterr().err
    assert excinfo.value.code == 2
    assert "must be a positive integer" in stderr
|