add pdftomd
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Opt-in gate: the whole module is skipped at import/collection time unless
# PDF2MD_RUN_MINERU_FIXTURES is set to exactly "1".
if os.getenv("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )


# REPO_ROOT is the third ancestor directory of this file's resolved path;
# sample PDFs are looked up in its "samples" child.
REPO_ROOT = Path(__file__).resolve().parents[2]
SAMPLES_DIR = REPO_ROOT.joinpath("samples")
|
||||
|
||||
|
||||
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Run the real CLI over every local sample PDF and check the release outputs.

    The test skips itself when ``python -m pdf2md.cli doctor`` exits non-zero
    or when ``SAMPLES_DIR`` contains no ``*.pdf`` files. For each sample it
    runs ``convert`` in a subprocess, asserts a zero exit code, checks that the
    Markdown, metadata JSON and report files exist, and inspects a few
    metadata/report fields. A JSON record of every attempt is written to
    ``fixture-evaluation.json`` under the temporary output root.
    """
    doctor = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    # Case-insensitive name ordering keeps the attempt record stable across runs.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []
    for pdf in sample_pdfs:
        sample_output = output_root / pdf.stem
        # Build the argv once so the recorded command can never drift from the
        # command that was actually executed.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(sample_output),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure the assertion message carries the full attempt record.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        markdown_path = sample_output / f"{pdf.stem}.md"
        metadata_path = sample_output / f"{pdf.stem}.metadata.json"
        report_path = sample_output / f"{pdf.stem}.report.md"
        assert markdown_path.exists()
        assert metadata_path.exists()
        assert report_path.exists()

        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        summary = metadata["summary"]
        assert metadata["engine"] == "MinerU"
        assert summary["pages_processed"] >= 1
        assert "warning_count" in summary
        assert "math_render_error_count" in summary
        assert "asset_count" in summary
        report = report_path.read_text(encoding="utf-8")
        assert "Output Markdown:" in report
        assert "Metadata JSON:" in report
        assert "Report Markdown:" in report
        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "metadata_path": str(metadata_path),
                "report_path": str(report_path),
                "warning_count": summary["warning_count"],
                "final_status": _report_final_status(report),
                "math_render_error_count": summary["math_render_error_count"],
                "asset_count": summary["asset_count"],
                "pages_processed": summary["pages_processed"],
            }
        )

    # Do not rely on the CLI having created the parent directory of the record.
    output_root.mkdir(parents=True, exist_ok=True)
    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    assert record_path.exists()
|
||||
|
||||
|
||||
def _report_final_status(report: str) -> str:
|
||||
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
|
||||
return match.group("status") if match else "unavailable"
|
||||
@@ -0,0 +1,152 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.conversion import convert_pdf
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FixtureAdapter:
    """Configurable stand-in for the MinerU adapter used by the fast integration tests.

    The constructor stores the canned Markdown, structured payload, success
    flag, optional asset file name and warnings. Every ``convert`` call is
    recorded in ``calls`` as an ``(input_path, work_dir)`` pair.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        self.calls: list[tuple[Path, Path]] = []
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.asset_name = asset_name
        self.warnings = warnings

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create the work directory, optionally drop a fake asset, and return a canned result.

        When ``succeeded`` is false the result carries no Markdown, exit code 2
        and a single ``MINERU_CLI_FAILED`` warning instead of the configured ones.
        """
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append((source, target_dir))

        produced_assets: tuple[Path, ...] = ()
        if self.asset_name is not None:
            fake_asset = target_dir / "assets" / self.asset_name
            fake_asset.parent.mkdir(parents=True, exist_ok=True)
            fake_asset.write_bytes(b"fake image")
            produced_assets = (fake_asset,)

        failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown = self.raw_markdown
            reported_warnings = self.warnings
            exit_code = 0
        else:
            markdown = None
            reported_warnings = (failure,)
            exit_code = 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=produced_assets,
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(directory: Path, name: str) -> Path:
    """Write a tiny placeholder PDF named ``name`` under ``directory`` and return its path.

    Missing parent directories are created first. The file only contains a PDF
    header line followed by a marker line.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\nfast integration fixture\n")
    return target
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
    """Convert a fixture with math, an HTML table and one asset, then check all outputs.

    Verifies the ``partial`` final status, the normalised math delimiters in
    the Markdown, the copied asset, the metadata summary counts and the
    corresponding report lines.
    """
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
    raw_markdown = "".join(
        [
            "# Shell Element\n\n",
            "Inline \\(u_i\\) and display:\n\n",
            "\\[\nK u = f\n\\]\n\n",
            '<table><tr><td rowspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n',
            "\n",
        ]
    )
    adapter = FixtureAdapter(
        raw_markdown=raw_markdown,
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(source_pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.final_status == "partial"
    assert result.markdown_path.exists()
    assert result.metadata_path is not None and result.metadata_path.exists()
    assert result.report_path.exists()
    copied_asset = out_dir / "쉘구조_math.assets" / "mesh.png"
    assert copied_asset.read_bytes() == b"fake image"

    markdown = result.markdown_path.read_text(encoding="utf-8")
    assert "$u_i$" in markdown
    assert "$$\nK u = f\n$$" in markdown
    # NOTE(review): an empty string is contained in every string, so this
    # assertion can never fail. The expected fragment (possibly an image link)
    # appears to be missing - confirm against the intended test.
    assert "" in markdown

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    assert summary["pages_processed"] == 3
    assert summary["asset_count"] == 1
    assert summary["inline_formula_count"] == 1
    assert summary["display_formula_count"] == 1
    assert summary["math_render_error_count"] == 0
    assert summary["warning_count"] == 1
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]

    report = result.report_path.read_text(encoding="utf-8")
    expected_fragments = (
        "- Final status: `partial`",
        "- Output Markdown:",
        "- Metadata JSON:",
        "- Report Markdown:",
        "- Math render error count: 0",
        "`TABLE_FALLBACK`",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||||
|
||||
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
    """A failing adapter yields a ``failed`` result with one warning and no output files."""
    source_pdf = make_pdf(tmp_path, "failed.pdf")
    failing_adapter = FixtureAdapter(raw_markdown="", succeeded=False)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert result.final_status == "failed"
    assert result.warning_count == 1
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MINERU_CLI_FAILED
    # None of the planned release outputs may exist after a failure.
    assert not result.markdown_path.exists()
    assert not result.report_path.exists()
    assert result.metadata_path is not None and not result.metadata_path.exists()
|
||||
|
||||
|
||||
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """Batch-convert a directory through the CLI and compare the summary with the files written."""
    source = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    first = make_pdf(source, "a.pdf")
    second = make_pdf(source, "한글.pdf")
    adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    exit_code = main(["convert", str(source), "--out", str(out_dir)], adapter=adapter, clock=fixed_clock)

    captured = capsys.readouterr()
    assert exit_code == 0
    # The adapter must have been invoked once per PDF, in this order, with resolved paths.
    converted_inputs = [input_path for input_path, _work_dir in adapter.calls]
    assert converted_inputs == [first.resolve(), second.resolve()]
    for summary_line in ("converted: 2", "failed: 0", "warnings: 0"):
        assert summary_line in captured.out
    expected_outputs = (
        "a.md",
        "a.metadata.json",
        "a.report.md",
        "한글.md",
        "한글.metadata.json",
        "한글.report.md",
    )
    for file_name in expected_outputs:
        assert (out_dir / file_name).exists()
|
||||
@@ -0,0 +1,232 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from importlib.metadata import entry_points
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.doctor import DoctorCheck, DoctorReport
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
|
||||
class FakeAdapter:
    """Minimal stand-in for the MinerU adapter used by the CLI tests.

    Each ``convert`` call appends the input path to ``calls`` and the received
    options object to ``options`` so tests can inspect what the CLI passed in.
    """

    def __init__(self, *, succeeded: bool = True) -> None:
        self.calls: list[Path] = []
        self.options: list[object] = []
        self.succeeded = succeeded

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create the work directory and return a canned success or failure result.

        On success the Markdown is a single heading built from the input file
        stem; on failure there is no Markdown, the exit code is 2 and one
        ``MINERU_CLI_FAILED`` warning is attached.
        """
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)
        self.options.append(options)

        warning = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown = f"# {source.stem}\n"
            reported_warnings = ()
            exit_code = 0
        else:
            markdown = None
            reported_warnings = (warning,)
            exit_code = 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(directory: Path, name: str) -> Path:
    """Write a placeholder file holding only a PDF header line and return its path.

    Parent directories of ``directory / name`` are created when missing.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\n")
    return target
|
||||
|
||||
|
||||
def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages to ``directory / name`` and return its path.

    Parent directories are created when missing.
    """
    target = directory.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    pdf_writer = PdfWriter()
    for _page in range(page_count):
        pdf_writer.add_blank_page(width=72, height=72)
    with open(target, "wb") as handle:
        pdf_writer.write(handle)
    return target
|
||||
|
||||
|
||||
def test_console_script_entry_point_is_reserved() -> None:
    """The installed ``pdf2md`` console script must point at ``pdf2md.cli:main``."""
    scripts = {}
    for script in entry_points(group="console_scripts"):
        scripts[script.name] = script

    pdf2md_script = scripts["pdf2md"]
    assert pdf2md_script.value == "pdf2md.cli:main"
|
||||
|
||||
|
||||
def test_cli_no_args_prints_help(capsys) -> None:
    """Calling the CLI with no arguments returns 0 and prints usage text to stdout."""
    exit_code = main([])
    assert exit_code == 0

    help_text = capsys.readouterr().out
    assert "usage: pdf2md" in help_text
    assert "convert" in help_text
    # The help output must not advertise a way to turn strict-local mode off.
    assert "--no-strict-local" not in help_text
|
||||
|
||||
|
||||
def test_cli_version_module_execution() -> None:
    """``python -m pdf2md.cli --version`` exits 0 and prints the version string."""
    argv = [sys.executable, "-m", "pdf2md.cli", "--version"]
    completed = subprocess.run(argv, check=False, capture_output=True, text=True)

    assert completed.returncode == 0
    printed = completed.stdout.strip()
    assert printed == "pdf2md 0.1.0"
|
||||
|
||||
|
||||
def test_cli_doctor_success_returns_zero(capsys) -> None:
    """A doctor report with a single passing check exits 0 and prints PASS lines."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("python", "pass", "ok"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    printed = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: PASS" in printed
    assert "[PASS] python: ok" in printed
|
||||
|
||||
|
||||
def test_cli_doctor_warning_only_returns_zero(capsys) -> None:
    """A doctor report containing only a warning still exits 0 and prints WARN lines."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("gpu", "warn", "missing"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    printed = capsys.readouterr().out
    assert exit_code == 0
    assert "Doctor status: WARN" in printed
    assert "[WARN] gpu: missing" in printed
|
||||
|
||||
|
||||
def test_cli_doctor_failure_returns_nonzero(capsys) -> None:
    """A doctor report with a failing check exits 1 and prints FAIL lines."""

    def run_doctor() -> DoctorReport:
        return DoctorReport((DoctorCheck("mineru", "fail", "missing"),))

    exit_code = main(["doctor"], doctor_runner=run_doctor)

    printed = capsys.readouterr().out
    assert exit_code == 1
    assert "Doctor status: FAIL" in printed
    assert "[FAIL] mineru: missing" in printed
|
||||
|
||||
|
||||
def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsys) -> None:
    """Converting one PDF writes the three output files and prints a clean summary."""
    out = tmp_path / "out"
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)

    printed = capsys.readouterr().out
    assert exit_code == 0
    for summary_line in ("converted: 1", "failed: 0", "warnings: 0"):
        assert summary_line in printed
    for file_name in ("paper.md", "paper.metadata.json", "paper.report.md"):
        assert (out / file_name).exists()
    assert adapter.calls == [source_pdf.resolve()]
    # Without explicit flags the CLI hands the adapter strict-local options on cuda:0.
    received_options = adapter.options[0]
    assert received_options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
|
||||
|
||||
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
    """PDFs in a directory are converted and reported in name order, not creation order."""
    source = tmp_path / "pdfs"
    # Created in reverse alphabetical order on purpose.
    for file_name in ("b.pdf", "a.pdf"):
        make_pdf(source, file_name)
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock)

    printed = capsys.readouterr().out
    assert exit_code == 0
    converted_names = [path.name for path in adapter.calls]
    assert converted_names == ["a.pdf", "b.pdf"]
    assert "converted: 2" in printed
    assert printed.index("a.pdf") < printed.index("b.pdf")
|
||||
|
||||
|
||||
def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> None:
    """With ``--recursive`` nested PDFs are converted and mirrored under the output directory."""
    source = tmp_path / "pdfs"
    out = tmp_path / "out"
    make_pdf(source, "top.pdf")
    make_pdf(source / "nested", "child.pdf")
    adapter = FakeAdapter()
    argv = ["convert", str(source), "--out", str(out), "--recursive"]

    exit_code = main(argv, adapter=adapter, clock=fixed_clock)

    printed = capsys.readouterr().out
    assert exit_code == 0
    converted_names = [path.name for path in adapter.calls]
    assert converted_names == ["child.pdf", "top.pdf"]
    assert "converted: 2" in printed
    assert (out / "nested" / "child.md").exists()
|
||||
|
||||
|
||||
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
    """A failed conversion makes the CLI exit 1, report the failure and write no Markdown."""
    out = tmp_path / "out"
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    failing_adapter = FakeAdapter(succeeded=False)

    exit_code = main(["convert", str(source_pdf), "--out", str(out)], adapter=failing_adapter, clock=fixed_clock)

    printed = capsys.readouterr().out
    assert exit_code == 1
    assert "failed: 1" in printed
    assert "warnings: 1" in printed
    assert not (out / "paper.md").exists()
|
||||
|
||||
|
||||
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
    """An already existing planned output makes the CLI exit 2 without calling the adapter."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out = tmp_path / "out"
    out.mkdir()
    # Pre-create the Markdown file the conversion would want to write.
    existing_output = out / "paper.md"
    existing_output.write_text("old", encoding="utf-8")
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)

    error_output = capsys.readouterr().err
    assert exit_code == 2
    assert "planned outputs already exist" in error_output
    assert adapter.calls == []
|
||||
|
||||
|
||||
def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, capsys) -> None:
    """A bare ``--chunk-pages`` flag splits a 21-page PDF into a 20-page part and a 1-page part."""
    out = tmp_path / "out"
    source_pdf = make_pdf_with_pages(tmp_path, "long.pdf", 21)
    adapter = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(out), "--chunk-pages"]

    exit_code = main(argv, adapter=adapter, clock=fixed_clock)

    printed = capsys.readouterr().out
    assert exit_code == 0
    assert "converted: 2" in printed
    chunk_names = [path.name for path in adapter.calls]
    assert chunk_names == [
        "long.part-001.pages-001-020.pdf",
        "long.part-002.pages-021-021.pdf",
    ]
    assert (out / "long.part-001.pages-001-020.md").exists()
    assert (out / "long.part-002.pages-021-021.md").exists()
|
||||
|
||||
|
||||
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
    """``--chunk-pages 0`` makes the CLI raise ``SystemExit(2)`` with an explanatory message on stderr."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--chunk-pages", "0"]

    with pytest.raises(SystemExit) as exc_info:
        main(argv)

    error_output = capsys.readouterr().err
    assert exc_info.value.code == 2
    assert "must be a positive integer" in error_output
|
||||
@@ -0,0 +1,418 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
|
||||
|
||||
class FakeAdapter:
    """Configurable stand-in for the MinerU adapter used by the conversion tests.

    Every ``convert`` call writes a ``raw.log`` file into the work directory
    and is recorded in ``calls`` as ``(input_path, work_dir, options)``.
    """

    def __init__(
        self,
        *,
        raw_markdown: str = "# Title\n",
        raw_structured: object | None = None,
        succeeded: bool = True,
        warnings: tuple[WarningRecord, ...] = (),
        asset_name: str | None = None,
    ) -> None:
        self.calls: list[tuple[Path, Path, object]] = []
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.warnings = warnings
        self.asset_name = asset_name

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Populate the work directory and return a canned result.

        The configured warnings are returned unchanged for both outcomes; on
        failure the result has no Markdown and exit code 2.
        """
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        # Leave a marker file so tests can tell whether the raw directory was kept.
        (target_dir / "raw.log").write_text("raw output", encoding="utf-8")
        self.calls.append((source, target_dir, options))

        produced_assets: tuple[Path, ...] = ()
        if self.asset_name is not None:
            fake_asset = target_dir / "assets" / self.asset_name
            fake_asset.parent.mkdir(parents=True, exist_ok=True)
            fake_asset.write_bytes(b"asset")
            produced_assets = (fake_asset,)

        if self.succeeded:
            markdown = self.raw_markdown
            exit_code = 0
        else:
            markdown = None
            exit_code = 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=produced_assets,
            warnings=self.warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class SequencedAdapter:
    """Adapter double whose calls succeed or fail according to a predefined sequence.

    ``outcomes`` is consumed from the front, one entry per ``convert`` call;
    every input path is appended to ``calls``.
    """

    def __init__(self, outcomes: tuple[bool, ...]) -> None:
        self.calls: list[Path] = []
        self.outcomes = list(outcomes)

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Return a canned result whose outcome is the next entry of ``outcomes``."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)

        # Raises IndexError when called more often than outcomes were supplied.
        succeeded = self.outcomes.pop(0)
        warning = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if succeeded:
            markdown = f"# {source.stem}\n"
            reported_warnings = ()
            exit_code = 0
        else:
            markdown = None
            reported_warnings = (warning,)
            exit_code = 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=reported_warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class NestedMinerUAssetAdapter:
    """Adapter double that emits one asset in a nested ``paper/hybrid_auto/images`` directory."""

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Write the nested asset and return a successful result that references it.

        The structured payload is a list of two entries with ``page_idx`` 0 and 12.
        """
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        nested_asset = target_dir.joinpath("paper", "hybrid_auto", "images", "fig.png")
        nested_asset.parent.mkdir(parents=True, exist_ok=True)
        nested_asset.write_bytes(b"nested asset")

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=True,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown="\n\n\\[x^2\\]\n",
            raw_structured=[{"page_idx": 0}, {"page_idx": 12}],
            asset_paths=(nested_asset,),
            warnings=(),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def make_pdf(tmp_path: Path, name: str = "paper.pdf") -> Path:
    """Write a tiny placeholder PDF named ``name`` under ``tmp_path`` and return its path.

    Missing parent directories are created first. The file only contains a PDF
    header line followed by a marker line.
    """
    target = tmp_path.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\nlocal fixture\n")
    return target
|
||||
|
||||
|
||||
def make_pdf_with_pages(tmp_path: Path, page_count: int, name: str = "paper.pdf") -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages to ``tmp_path / name`` and return its path.

    Parent directories are created when missing.
    """
    target = tmp_path.joinpath(name)
    target.parent.mkdir(parents=True, exist_ok=True)
    pdf_writer = PdfWriter()
    for _page in range(page_count):
        pdf_writer.add_blank_page(width=72, height=72)
    with open(target, "wb") as handle:
        pdf_writer.write(handle)
    return target
|
||||
|
||||
|
||||
def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) -> None:
    """A successful conversion writes Markdown, metadata, report and assets, then removes the work dir."""
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(
        raw_markdown="# Title\n\nInline \\(x_i\\)\n\n\n",
        raw_structured={"pages": [{}, {}]},
        asset_name="fig.png",
    )

    result = convert_pdf(source_pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.succeeded is True
    assert result.final_status == "success"
    assert result.pages_processed == 2
    assert result.warning_count == 0
    assert result.engine == "MinerU"
    assert result.engine_version == "3.1.0"
    written_markdown = result.markdown_path.read_text(encoding="utf-8")
    assert written_markdown == "# Title\n\nInline $x_i$\n\n\n"
    assert (out_dir / "paper.assets" / "fig.png").read_bytes() == b"asset"
    assert result.report_path.exists()

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    expected_digest = hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    assert metadata["source_sha256"] == expected_digest
    assert metadata["created_at"] == "2026-05-08T00:00:00Z"
    assert summary["pages_processed"] == 2
    assert summary["inline_formula_count"] == 1
    assert summary["asset_count"] == 1
    assert metadata["assets"] == [{"relative_path": "paper.assets/fig.png"}]
    assert "- Final status: `success`" in result.report_path.read_text(encoding="utf-8")
    # The adapter's work directory must be gone once the conversion has finished.
    _input_path, work_dir, _options = adapter.calls[0]
    assert not work_dir.exists()
|
||||
|
||||
|
||||
def test_convert_pdf_adapter_failure_returns_failed_result_without_fallback_or_outputs(tmp_path: Path) -> None:
    """An adapter failure is reported as ``failed``, the adapter runs once, and no outputs are written."""
    source_pdf = make_pdf(tmp_path)
    cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
    failing_adapter = FakeAdapter(succeeded=False, warnings=(cli_failure,))

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert result.succeeded is False
    assert result.final_status == "failed"
    assert result.warnings == (cli_failure,)
    # Exactly one adapter call: no retry or fallback engine was attempted.
    assert len(failing_adapter.calls) == 1
    assert not result.markdown_path.exists()
    assert not result.report_path.exists()
|
||||
|
||||
|
||||
def test_convert_pdf_respects_output_conflicts_and_overwrite(tmp_path: Path) -> None:
    """An existing output raises ``OutputConflictError`` unless ``overwrite=True`` is passed."""
    source_pdf = make_pdf(tmp_path)
    out = tmp_path / "out"
    out.mkdir()
    stale_markdown = out / "paper.md"
    stale_markdown.write_text("old", encoding="utf-8")

    with pytest.raises(OutputConflictError):
        convert_pdf(source_pdf, out, adapter=FakeAdapter(), clock=fixed_clock)

    replacing_adapter = FakeAdapter(raw_markdown="new\n")
    result = convert_pdf(source_pdf, out, adapter=replacing_adapter, clock=fixed_clock, overwrite=True)

    assert result.succeeded is True
    assert result.markdown_path.read_text(encoding="utf-8") == "new\n"
|
||||
|
||||
|
||||
def test_convert_pdf_can_skip_metadata_json_but_still_writes_report(tmp_path: Path) -> None:
    """With ``metadata=False`` no metadata JSON is written or mentioned, but the report still is."""
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path)

    result = convert_pdf(source_pdf, out_dir, metadata=False, adapter=FakeAdapter(), clock=fixed_clock)

    assert result.metadata_path is None
    assert result.markdown_path.exists()
    assert result.report_path.exists()
    assert not (out_dir / "paper.metadata.json").exists()
    report_text = result.report_path.read_text(encoding="utf-8")
    assert "Metadata JSON:" not in report_text
    assert "Report Markdown:" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_path: Path) -> None:
    """A math checker that rejects the formula produces a ``partial`` result with a render warning."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")

    def reject_everything(_):
        return False

    result = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=adapter,
        math_checker=reject_everything,
        clock=fixed_clock,
    )

    assert result.final_status == "partial"
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_FAILED]
    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["summary"]["math_render_error_count"] == 1
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_FAILED"
    report_text = result.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 1" in report_text
    assert "`MATH_RENDER_FAILED`" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
    """When no default math checker can be created, math output gets an INFO warning but no render error."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    # Simulate an environment in which the default checker factory yields nothing.
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: None)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    assert result.final_status == "partial"
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
    summary = json.loads(result.metadata_path.read_text(encoding="utf-8"))["summary"]
    assert summary["warning_count"] == 1
    assert summary["math_render_error_count"] == 0
    report_text = result.report_path.read_text(encoding="utf-8")
    assert "unavailable" in report_text
    assert "- Math render error count: 0" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_uses_default_math_checker_when_available(tmp_path: Path, monkeypatch) -> None:
    """Without an explicit ``math_checker`` the patched default factory's checker receives the formulas."""

    class RecordingChecker:
        """Remembers the bodies of the expressions it was asked to check and accepts them."""

        def __init__(self) -> None:
            self.bodies: list[str] = []

        def check_expressions(self, expressions):
            collected = []
            for expression in expressions:
                collected.append(expression.body)
            self.bodies = collected
            return (True,)

    checker = RecordingChecker()
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: checker)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    assert result.final_status == "success"
    assert result.warning_count == 0
    assert checker.bodies == ["x"]
|
||||
|
||||
|
||||
def test_convert_pdf_keep_raw_preserves_adapter_work_directory(tmp_path: Path) -> None:
    """With ``keep_raw=True`` the adapter's files are available under ``<stem>.raw`` in the output dir."""
    out_dir = tmp_path / "out"
    source_pdf = make_pdf(tmp_path)

    result = convert_pdf(source_pdf, out_dir, keep_raw=True, adapter=FakeAdapter(), clock=fixed_clock)

    assert result.raw_dir == out_dir / "paper.raw"
    kept_log = result.raw_dir / "raw.log"
    assert kept_log.read_text(encoding="utf-8") == "raw output"
|
||||
|
||||
|
||||
def test_convert_pdf_rejects_disabling_strict_local(tmp_path: Path) -> None:
    """Passing ``strict_local=False`` raises ``StrictLocalViolationError``."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter()

    with pytest.raises(StrictLocalViolationError):
        convert_pdf(source_pdf, tmp_path / "out", strict_local=False, adapter=adapter, clock=fixed_clock)
|
||||
|
||||
|
||||
def test_convert_pdf_passes_gpu_device_to_strict_local_options(tmp_path: Path) -> None:
    """An explicit ``gpu`` argument shows up as ``gpu_device`` in the options given to the adapter."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter()

    convert_pdf(source_pdf, tmp_path / "out", gpu="cuda:0", adapter=adapter, clock=fixed_clock)

    _input_path, _work_dir, received_options = adapter.calls[0]
    assert received_options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
|
||||
|
||||
def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None:
    """When no ``gpu`` argument is given the engine options still name ``cuda:0``."""
    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter()

    convert_pdf(source_pdf, tmp_path / "out", adapter=fake_adapter, clock=fixed_clock)

    # The third element of the first recorded call carries the options object.
    options = fake_adapter.calls[0][2]
    assert options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
|
||||
|
||||
def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_path: Path) -> None:
    """Nested adapter assets land under ``paper.assets`` and 13 processed pages are reported."""
    source_pdf = make_pdf(tmp_path)

    outcome = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=NestedMinerUAssetAdapter(),
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    assert outcome.final_status == "success"
    assert outcome.pages_processed == 13

    rendered = outcome.markdown_path.read_text(encoding="utf-8")
    # NOTE(review): this literal is empty in the text under review, which makes the
    # membership check vacuous; the expected rewritten image link may have been lost
    # in extraction -- confirm against the repository copy.
    assert "" in rendered
    assert "](images/fig.png)" not in rendered

    nested_asset = tmp_path / "out" / "paper.assets" / "paper" / "hybrid_auto" / "images" / "fig.png"
    assert nested_asset.read_bytes() == b"nested asset"

    metadata = json.loads(outcome.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    assert summary["pages_processed"] == 13
    assert summary["warning_count"] == 0
|
||||
|
||||
|
||||
def test_convert_input_batch_continues_after_per_file_failure(tmp_path: Path) -> None:
    """A failure on the second of three PDFs does not stop the third from converting."""
    pdf_dir = tmp_path / "pdfs"
    for pdf_name in ("a.pdf", "b.pdf", "c.pdf"):
        make_pdf(pdf_dir, pdf_name)
    # The adapter is built with one flag per call; here the middle one is False.
    sequenced = SequencedAdapter((True, False, True))

    batch = convert_input(pdf_dir, tmp_path / "out", adapter=sequenced, clock=fixed_clock)

    seen_names = [path.name for path in sequenced.calls]
    assert seen_names == ["a.pdf", "b.pdf", "c.pdf"]
    assert batch.converted_count == 2
    assert batch.failed_count == 1
    out_dir = tmp_path / "out"
    assert (out_dir / "a.md").exists()
    assert not (out_dir / "b.md").exists()
    assert (out_dir / "c.md").exists()
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(tmp_path: Path) -> None:
    """A 41-page PDF with chunk_pages=20 yields three chunk results and no leftover chunk PDFs."""
    source_pdf = make_pdf_with_pages(tmp_path, 41, "thesis.pdf")
    fake_adapter = FakeAdapter(raw_structured={"pages": 1})

    batch = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=lambda _: True,
        chunk_pages=20,
        clock=fixed_clock,
    )

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 3

    expected_markdown_names = [
        "thesis.part-001.pages-001-020.md",
        "thesis.part-002.pages-021-040.md",
        "thesis.part-003.pages-041-041.md",
    ]
    assert [result.markdown_path.name for result in batch.results] == expected_markdown_names

    expected_chunk_pdf_names = [
        "thesis.part-001.pages-001-020.pdf",
        "thesis.part-002.pages-021-040.pdf",
        "thesis.part-003.pages-041-041.pdf",
    ]
    assert [path.name for path, _, _ in fake_adapter.calls] == expected_chunk_pdf_names

    # Every result refers back to the original PDF, and no chunk PDF exists any more.
    assert all(result.source_pdf == source_pdf.resolve() for result in batch.results)
    assert all(not path.exists() for path, _, _ in fake_adapter.calls)

    out_dir = tmp_path / "out"
    metadata_text = (out_dir / "thesis.part-002.pages-021-040.metadata.json").read_text(encoding="utf-8")
    metadata = json.loads(metadata_text)
    assert metadata["source_pdf"] == str(source_pdf.resolve())
    assert metadata["source_sha256"] == hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    expected_chunk_options = {
        "chunk_index": 2,
        "chunk_page_count": 20,
        "chunk_pdf_name": "thesis.part-002.pages-021-040.pdf",
        "original_source_pdf": str(source_pdf.resolve()),
        "source_page_end": 40,
        "source_page_start": 21,
        "total_chunks": 3,
    }
    assert metadata["engine_options"]["chunk"] == expected_chunk_options

    report_text = (out_dir / "thesis.part-002.pages-021-040.report.md").read_text(encoding="utf-8")
    assert "- Chunk: 2/3, source pages: 21-40" in report_text
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_keeps_short_pdf_as_single_batch_result(tmp_path: Path) -> None:
    """A 3-page PDF with chunk_pages=20 is converted as one result from the original file."""
    source_pdf = make_pdf_with_pages(tmp_path, 3, "short.pdf")
    fake_adapter = FakeAdapter(raw_structured={"pages": 3})

    batch = convert_pdf(source_pdf, tmp_path / "out", adapter=fake_adapter, chunk_pages=20, clock=fixed_clock)

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 1
    only_result = batch.results[0]
    assert only_result.markdown_path.name == "short.md"
    # The adapter was given the original PDF, and that file still exists.
    assert fake_adapter.calls[0][0] == source_pdf.resolve()
    assert fake_adapter.calls[0][0].exists()
|
||||
|
||||
|
||||
def test_convert_input_chunk_mode_continues_after_failed_chunk(tmp_path: Path) -> None:
    """A failing middle chunk is counted as failed while the other two chunks are written."""
    source_pdf = make_pdf_with_pages(tmp_path, 41, "paper.pdf")
    sequenced = SequencedAdapter((True, False, True))

    batch = convert_input(source_pdf, tmp_path / "out", adapter=sequenced, chunk_pages=20, clock=fixed_clock)

    assert batch.converted_count == 2
    assert batch.failed_count == 1
    expected_chunk_pdf_names = [
        "paper.part-001.pages-001-020.pdf",
        "paper.part-002.pages-021-040.pdf",
        "paper.part-003.pages-041-041.pdf",
    ]
    assert [path.name for path in sequenced.calls] == expected_chunk_pdf_names
    out_dir = tmp_path / "out"
    assert (out_dir / "paper.part-001.pages-001-020.md").exists()
    assert not (out_dir / "paper.part-002.pages-021-040.md").exists()
    assert (out_dir / "paper.part-003.pages-041-041.md").exists()
|
||||
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.doctor import DoctorCommandResult, DoctorReport, format_doctor_report, run_doctor
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.math_render import default_mathjax_helper_path
|
||||
from pdf2md.mineru_adapter import MinerUVersionResult
|
||||
|
||||
|
||||
class FakeMinerUProbe:
    """Probe double whose ``version()`` hands back a canned result."""

    def __init__(self, result: MinerUVersionResult) -> None:
        # Stored as-is; version() returns this same object.
        self.result = result

    def version(self) -> MinerUVersionResult:
        """Return the result supplied at construction time."""
        canned = self.result
        return canned
|
||||
|
||||
|
||||
class FakeCuda:
    """Fake CUDA namespace exposing availability, device names and capabilities."""

    def __init__(
        self,
        *,
        available: bool = True,
        devices: tuple[str, ...] = ("NVIDIA RTX 4060",),
        capabilities: tuple[tuple[int, int], ...] = ((8, 9),),
    ) -> None:
        # Names and capabilities are both looked up by device index.
        self._available = available
        self._devices = devices
        self._capabilities = capabilities

    def is_available(self) -> bool:
        """Return the availability flag given at construction."""
        return self._available

    def device_count(self) -> int:
        """Return how many device names were configured."""
        names = self._devices
        return len(names)

    def get_device_name(self, index: int) -> str:
        """Return the configured name of device *index*."""
        names = self._devices
        return names[index]

    def get_device_capability(self, index: int) -> tuple[int, int]:
        """Return the configured capability pair of device *index*."""
        pairs = self._capabilities
        return pairs[index]
|
||||
|
||||
|
||||
class FakeTorchVersion:
    """Version holder exposing only a ``cuda`` version string."""

    cuda = "12.8"
|
||||
|
||||
|
||||
class FakeTorch:
    """Fake ``torch`` module object with a version string, version info and a cuda attribute."""

    __version__ = "2.8.0+cu128"
    version = FakeTorchVersion()

    def __init__(self, cuda: FakeCuda) -> None:
        # The fake CUDA namespace is supplied by the caller.
        self.cuda = cuda
|
||||
|
||||
|
||||
def test_doctor_all_checks_pass_with_mocked_tools(tmp_path: Path) -> None:
    """With healthy fakes the report passes, exits 0 and lists the checks in a fixed order."""
    hf_home = tmp_path / "hf"
    expected_check_names = [
        "python",
        "uv",
        "mineru",
        "gpu",
        "pytorch",
        "models",
        "mathjax",
        "local-only",
    ]

    report = make_report(
        tmp_path,
        env={"HF_HOME": str(hf_home)},
        existing_paths={hf_home},
    )

    assert report.status == "pass"
    assert report.exit_code == 0
    assert [check.name for check in report.checks] == expected_check_names
|
||||
|
||||
|
||||
def test_doctor_fails_outside_python_312(tmp_path: Path) -> None:
    """Python 3.11.9 fails the python check and the report, with a 3.12.x hint."""
    report = make_report(tmp_path, python_version=(3, 11, 9))

    check = find_check(report, "python")
    assert report.status == "fail"
    assert check.status == "fail"
    assert "use Python 3.12.x" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_uv_is_missing(tmp_path: Path) -> None:
    """A tool table without ``uv`` fails the uv check and the whole report."""
    tools_without_uv = {"nvidia-smi": "C:/Windows/System32/nvidia-smi.exe"}

    report = make_report(tmp_path, available_tools=tools_without_uv)

    check = find_check(report, "uv")
    assert report.status == "fail"
    assert check.status == "fail"
    assert "uv executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_fails_when_mineru_is_missing(tmp_path: Path) -> None:
    """An unavailable MinerU probe result fails the mineru check and sets exit code 1."""
    missing_cli = MinerUVersionResult(
        available=False,
        version=None,
        command=("mineru", "--version"),
        exit_code=None,
        stdout="",
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=missing_cli)

    check = find_check(report, "mineru")
    assert report.status == "fail"
    assert report.exit_code == 1
    assert check.status == "fail"
    assert "MinerU CLI executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_command_fails(tmp_path: Path) -> None:
    """An available CLI whose version command exits 2 produces a warning, not a failure."""
    cli_warning = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU version command failed.",
    )
    failed_version = MinerUVersionResult(
        available=True,
        version=None,
        command=("mineru", "--version"),
        exit_code=2,
        stdout="",
        stderr="boom",
        warnings=(cli_warning,),
    )

    report = make_report(tmp_path, mineru_result=failed_version)

    check = find_check(report, "mineru")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "version could not be detected" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mineru_version_is_not_target(tmp_path: Path) -> None:
    """MinerU 3.1.8 draws a warning that mentions the 3.1.0 project target."""
    other_version = MinerUVersionResult(
        available=True,
        version="mineru, version 3.1.8",
        command=("mineru", "--version"),
        exit_code=0,
        stdout="mineru, version 3.1.8",
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=other_version)

    check = find_check(report, "mineru")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "project target is 3.1.0" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None:
    """With only ``uv`` on the tool table and torch unimportable, gpu and pytorch both warn."""
    only_uv = {"uv": "C:/Users/user/.local/bin/uv.exe"}

    report = make_report(
        tmp_path,
        available_tools=only_uv,
        import_module=missing_torch,
    )

    assert report.status == "warn"
    for check_name in ("gpu", "pytorch"):
        assert find_check(report, check_name).status == "warn"
|
||||
|
||||
|
||||
def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None:
    """A GTX 1070 Ti in the nvidia-smi output triggers the Pascal compatibility warning."""
    smi_output = "NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n"

    report = make_report(tmp_path, gpu_stdout=smi_output)

    check = find_check(report, "gpu")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    assert any("GTX 1070 Ti" in detail for detail in check.details)
|
||||
|
||||
|
||||
def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None:
    """A torch fake reporting compute capability 6.1 makes the pytorch check warn."""

    def fake_pascal_torch(name: str) -> FakeTorch:
        assert name == "torch"
        pascal_cuda = FakeCuda(devices=("NVIDIA GeForce GTX 1070 Ti",), capabilities=((6, 1),))
        return FakeTorch(pascal_cuda)

    report = make_report(
        tmp_path,
        gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n",
        import_module=fake_pascal_torch,
    )

    check = find_check(report, "pytorch")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in check.message
    assert any("compute capability 6.1" in detail for detail in check.details)
|
||||
|
||||
|
||||
def test_doctor_warns_when_model_cache_is_not_detected(tmp_path: Path) -> None:
    """An empty environment and no existing paths make the models check warn."""
    report = make_report(tmp_path, env={}, existing_paths=set())

    check = find_check(report, "models")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "No MinerU model/cache/config path" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_node_is_missing(tmp_path: Path) -> None:
    """A tool table lacking ``node`` makes the mathjax check warn about Node.js."""
    tools_without_node = {
        "uv": "C:/Users/user/.local/bin/uv.exe",
        "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
    }

    report = make_report(tmp_path, available_tools=tools_without_node)

    check = find_check(report, "mathjax")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "Node.js executable was not found" in check.message
|
||||
|
||||
|
||||
def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
    """A failing ``--health`` command makes the mathjax check warn and surface the stderr."""
    healthy_runner = None

    def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult:
        # Only the health probe fails; every other command uses the standard fake runner.
        if command[-1] != "--health":
            return command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")(command)
        return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'")

    del healthy_runner
    report = make_report(tmp_path, run_command=failing_runner)

    check = find_check(report, "mathjax")
    assert report.status == "warn"
    assert check.status == "warn"
    assert "unavailable" in check.message
    assert any("mathjax" in detail for detail in check.details)
|
||||
|
||||
|
||||
def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
    """The formatted report starts with the overall status and tags each check line."""
    report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n")

    text = format_doctor_report(report)

    assert text.startswith("Doctor status: WARN\n")
    for expected_fragment in ("[WARN] gpu:", "[PASS] local-only:"):
        assert expected_fragment in text
|
||||
|
||||
|
||||
def make_report(
    tmp_path: Path,
    *,
    python_version: tuple[int, int, int] = (3, 12, 7),
    available_tools: dict[str, str] | None = None,
    mineru_result: MinerUVersionResult | None = None,
    gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n",
    env: dict[str, str] | None = None,
    existing_paths: set[Path] | None = None,
    import_module=None,
    run_command=None,
) -> DoctorReport:
    """Run ``run_doctor`` with faked collaborators and return the resulting report.

    Every argument left at ``None`` is replaced by a healthy default. An explicitly
    passed empty value (for example ``available_tools={}``) is used as given, the
    same way ``env`` and ``existing_paths`` are handled.

    Args:
        tmp_path: Passed to ``run_doctor`` as ``home``; also the parent of the default ``hf`` path.
        python_version: Version triple reported to the doctor.
        available_tools: Executable name -> path table backing the fake ``which``.
        mineru_result: Result returned by the fake MinerU probe.
        gpu_stdout: Stdout the default command runner returns for ``nvidia-smi``.
        env: Environment mapping handed to the doctor.
        existing_paths: Paths the fake ``path_exists`` reports as present.
        import_module: Replacement importer; defaults to ``fake_torch``.
        run_command: Replacement command runner; defaults to ``command_runner(gpu_stdout)``.

    Returns:
        The ``DoctorReport`` produced by ``run_doctor``.
    """
    # Compare against None rather than relying on truthiness, so an empty dict
    # means "no tools available" instead of silently selecting the defaults.
    if available_tools is None:
        tools = {
            "uv": "C:/Users/user/.local/bin/uv.exe",
            "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
            "node": "C:/Program Files/nodejs/node.exe",
        }
    else:
        tools = available_tools

    if mineru_result is None:
        result = MinerUVersionResult(
            available=True,
            version="mineru, version 3.1.0",
            command=("mineru", "--version"),
            exit_code=0,
            stdout="mineru, version 3.1.0",
            stderr="",
        )
    else:
        result = mineru_result

    environment = env if env is not None else {"HF_HOME": str(tmp_path / "hf")}
    # Copy so the caller's set is not mutated by the helper-path addition below.
    paths = set(existing_paths if existing_paths is not None else {tmp_path / "hf"})
    # The MathJax helper path is always reported as existing.
    paths.add(default_mathjax_helper_path())

    runner = run_command if run_command is not None else command_runner(gpu_stdout)
    importer = import_module if import_module is not None else fake_torch

    return run_doctor(
        python_version=python_version,
        which=lambda executable: tools.get(executable),
        run_command=runner,
        import_module=importer,
        env=environment,
        path_exists=lambda path: path in paths,
        home=tmp_path,
        mineru_probe=FakeMinerUProbe(result),
    )
|
||||
|
||||
|
||||
def command_runner(gpu_stdout: str):
    """Build a fake command runner whose ``nvidia-smi`` answer is *gpu_stdout*."""

    def run(command: tuple[str, ...]) -> DoctorCommandResult:
        # An empty command matches none of the known shapes.
        if not command:
            return DoctorCommandResult(command, 127, stderr="not found")

        executable, last_argument = command[0], command[-1]
        if command == ("uv", "--version"):
            return DoctorCommandResult(command, 0, stdout="uv 0.8.13\n")
        if executable == "nvidia-smi":
            return DoctorCommandResult(command, 0, stdout=gpu_stdout)
        is_node_version = len(command) == 2 and command[1] == "--version" and executable.endswith("node.exe")
        if is_node_version:
            return DoctorCommandResult(command, 0, stdout="v24.13.0\n")
        if last_argument == "--health":
            return DoctorCommandResult(command, 0, stdout='{"ok":true}\n')
        # Anything else is reported as a missing command.
        return DoctorCommandResult(command, 127, stderr="not found")

    return run
|
||||
|
||||
|
||||
def fake_torch(name: str) -> FakeTorch:
    """Importer double: accept only ``"torch"`` and return a FakeTorch with default FakeCuda."""
    assert name == "torch"
    default_cuda = FakeCuda()
    return FakeTorch(default_cuda)
|
||||
|
||||
|
||||
def missing_torch(name: str):
    """Importer double that always raises ImportError for ``"torch"``."""
    assert name == "torch"
    failure = ImportError(name)
    raise failure
|
||||
|
||||
|
||||
def find_check(report: DoctorReport, name: str):
    """Return the first check in ``report.checks`` whose ``name`` equals *name*.

    Raises:
        LookupError: If no check has that name. The message lists the names that
            are present, which is easier to diagnose than the bare StopIteration
            an exhausted ``next()`` would produce.
    """
    for check in report.checks:
        if check.name == name:
            return check
    available = ", ".join(str(check.name) for check in report.checks)
    raise LookupError(f"doctor report has no check named {name!r}; available checks: {available}")
|
||||
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
|
||||
|
||||
def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None:
    """Optional fields that are set appear in ``to_dict()`` and the dict can be JSON-encoded."""
    formula_block = BlockRecord(
        BlockType.INLINE_FORMULA,
        page_index=1,
        bbox=(1.0, 2.0, 3.0, 4.0),
        confidence=0.92,
        markdown_span=(10, 20),
    )
    page = PageRecord(page_index=1, width=612, height=792, blocks=(formula_block,))
    asset = AssetRecord("paper.assets/image.png", page_index=1, bbox=(5.0, 6.0, 7.0, 8.0))
    low_confidence = WarningRecord(
        WarningCode.LOW_CONFIDENCE_FORMULA,
        WarningSeverity.WARNING,
        "Formula confidence is low.",
        page_index=1,
        bbox=(1.0, 2.0, 3.0, 4.0),
    )
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(page,), assets=(asset,), warnings=(low_confidence,))

    data = document.to_dict()

    assert data["source_pdf"] == str(tmp_path / "paper.pdf")
    first_page = data["pages"][0]
    assert first_page["width"] == 612
    assert first_page["height"] == 792
    first_block = first_page["blocks"][0]
    # Tuples passed in are compared against lists here.
    assert first_block["bbox"] == [1.0, 2.0, 3.0, 4.0]
    assert first_block["confidence"] == 0.92
    assert first_block["markdown_span"] == [10, 20]
    assert data["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0]
    assert data["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]
    # json.dumps raises if anything in the dict is not JSON-serializable.
    json.dumps(data)
|
||||
|
||||
|
||||
def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None:
    """Optional fields left unset are absent from the serialized block and page."""
    bare_block = BlockRecord(BlockType.PARAGRAPH)
    bare_page = PageRecord(page_index=0, blocks=(bare_block,))
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(bare_page,))

    block_data = document.to_dict()["pages"][0]["blocks"][0]
    page_data = document.to_dict()["pages"][0]

    for absent_key in ("page_index", "bbox", "confidence", "markdown_span"):
        assert absent_key not in block_data
    for absent_key in ("width", "height"):
        assert absent_key not in page_data
|
||||
|
||||
|
||||
def test_block_types_and_warning_codes_match_architecture_set() -> None:
    """BlockType values match the expected set exactly; WarningCode is a superset of its list."""
    expected_block_types = {
        "heading",
        "paragraph",
        "inline_formula",
        "display_formula",
        "table",
        "figure",
        "caption",
        "footnote",
        "reference",
        "unknown",
    }
    required_warning_codes = {
        "ENGINE_MISSING",
        "GPU_UNAVAILABLE",
        "LOW_CONFIDENCE_FORMULA",
        "MATH_RENDER_FAILED",
        "ASSET_LINK_MISSING",
        "READING_ORDER_UNCERTAIN",
        "STRICT_LOCAL_VIOLATION",
        "MINERU_CLI_FAILED",
    }

    actual_block_types = {item.value for item in BlockType}
    assert actual_block_types == expected_block_types
    # Additional warning codes are allowed; only the listed ones are required.
    actual_warning_codes = {item.value for item in WarningCode}
    assert actual_warning_codes >= required_warning_codes
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_block_type", ["formula", "image"])
def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None:
    """A string that is not a BlockType is rejected with an 'invalid block_type' ValueError."""
    expected_error = pytest.raises(ValueError, match="invalid block_type")
    with expected_error:
        BlockRecord(invalid_block_type)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_code", ["REMOTE_API_USED", "UNKNOWN_WARNING"])
def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None:
    """A string that is not a WarningCode is rejected with an 'invalid code' ValueError."""
    expected_error = pytest.raises(ValueError, match="invalid code")
    with expected_error:
        WarningRecord(invalid_code, WarningSeverity.WARNING, "message")  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("invalid_severity", ["fatal", "warn"])
def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None:
    """A string that is not a WarningSeverity is rejected with an 'invalid severity' ValueError."""
    expected_error = pytest.raises(ValueError, match="invalid severity")
    with expected_error:
        WarningRecord(WarningCode.MATH_RENDER_FAILED, invalid_severity, "message")  # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_empty_pages_are_rejected(tmp_path: Path) -> None:
    """A DocumentRecord with no pages raises a ValueError mentioning 'at least one page'."""
    expected_error = pytest.raises(ValueError, match="at least one page")
    with expected_error:
        DocumentRecord(tmp_path / "paper.pdf", pages=())
|
||||
|
||||
|
||||
def test_empty_source_pdf_is_rejected() -> None:
    """An empty source path raises a ValueError mentioning 'source_pdf'."""
    expected_error = pytest.raises(ValueError, match="source_pdf")
    with expected_error:
        DocumentRecord("", pages=(PageRecord(page_index=0),))
|
||||
|
||||
|
||||
def test_invalid_optional_fields_are_rejected() -> None:
    """Each invalid optional BlockRecord field raises a ValueError naming that field."""
    # Negative page index.
    with pytest.raises(ValueError, match="page_index"):
        BlockRecord(BlockType.PARAGRAPH, page_index=-1)

    # Bounding box with three values instead of four.
    with pytest.raises(ValueError, match="bbox"):
        BlockRecord(BlockType.PARAGRAPH, bbox=(1.0, 2.0, 3.0))  # type: ignore[arg-type]

    # Confidence of 1.2.
    with pytest.raises(ValueError, match="confidence"):
        BlockRecord(BlockType.PARAGRAPH, confidence=1.2)

    # Span whose first value is larger than its second.
    with pytest.raises(ValueError, match="markdown_span"):
        BlockRecord(BlockType.PARAGRAPH, markdown_span=(5, 3))
|
||||
|
||||
|
||||
def test_asset_paths_must_be_relative() -> None:
    """An absolute path and a parent-directory path are both rejected as not relative."""
    # Absolute path.
    with pytest.raises(ValueError, match="relative"):
        AssetRecord("/absolute/image.png")

    # Path that climbs out through "..".
    with pytest.raises(ValueError, match="relative"):
        AssetRecord("../outside.png")
|
||||
@@ -0,0 +1,159 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.markdown import normalize_markdown
|
||||
|
||||
|
||||
def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    r"""Inline ``\( ... \)`` math is rewritten with single-dollar delimiters, without warnings."""
    raw = r"Area is \(x_i^2 + y^{2}\)."

    normalized = normalize_markdown(raw)

    assert normalized.markdown == r"Area is $x_i^2 + y^{2}$."
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text mixing dollar amounts with dollar-delimited math comes back unchanged."""
    original_text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    normalized = normalize_markdown(original_text)

    assert normalized.markdown == original_text
|
||||
|
||||
|
||||
def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    r"""Display ``\[ ... \]`` math becomes a ``$$`` block separated by blank lines."""
    raw = "Before\n\\[\na_i^2 + b^2\n\\]\nAfter"

    normalized = normalize_markdown(raw)

    assert normalized.markdown == "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"
|
||||
|
||||
|
||||
def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    """An align environment inside display brackets is kept verbatim between ``$$`` lines."""
    raw = "\\[\\begin{align}\na_i &= b^2\n\\end{align}\\]"
    expected = "$$\n\\begin{align}\na_i &= b^2\n\\end{align}\n$$"

    normalized = normalize_markdown(raw)

    assert normalized.markdown == expected
|
||||
|
||||
|
||||
def test_existing_display_math_spacing_is_idempotent() -> None:
    """Normalizing a ``$$`` block adds blank lines once; a second pass changes nothing."""
    raw = "Before\n$$\nx_i^2\n$$\nAfter"

    first_pass = normalize_markdown(raw).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass
|
||||
|
||||
|
||||
def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the delimiters change; the math body is carried over character for character."""
    raw = r"\(\frac{x_i^{2}}{\alpha_beta}\)"

    normalized = normalize_markdown(raw)

    assert normalized.markdown == r"$\frac{x_i^{2}}{\alpha_beta}$"
|
||||
|
||||
|
||||
def test_fenced_code_blocks_are_not_normalized() -> None:
    """Math delimiters inside a fenced block stay as-is; the math after the fence is converted."""
    raw = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n\\(z\\)"
    expected = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n\n```\n$z$"

    normalized = normalize_markdown(raw)

    assert normalized.markdown == expected
|
||||
|
||||
|
||||
def test_inline_code_spans_are_not_normalized() -> None:
    """Math inside backticks is left alone while math outside them is converted."""
    raw = r"Keep `\(x_i\)` and convert \(y_i\)."

    normalized = normalize_markdown(raw)

    assert normalized.markdown == r"Keep `\(x_i\)` and convert $y_i$."
|
||||
|
||||
|
||||
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None:
    """A second normalization pass over the first pass's output changes nothing."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    (asset_root / "fig 1.png").write_bytes(b"image")
    # NOTE(review): the asset file created above is not referenced by this source
    # string as it appears here; an image link may have been lost in extraction --
    # confirm against the repository copy.
    raw = "Before \\(x_i\\)\n\\[y^2\\]\n"

    first_pass = normalize_markdown(raw, markdown_dir=tmp_path, asset_root=tmp_path / "assets", check_assets=True)
    second_pass = normalize_markdown(
        first_pass.markdown,
        markdown_dir=tmp_path,
        asset_root=tmp_path / "assets",
        check_assets=True,
    )

    assert second_pass.markdown == first_pass.markdown
    assert second_pass.warnings == first_pass.warnings
|
||||
|
||||
|
||||
def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None:
    """Normalization reports the asset link ``assets/fig 1.png`` and emits no warnings."""
    # NOTE(review): the input and expected markdown literals are empty in the text
    # under review, although an asset link is still expected below; the image
    # markup was probably lost in extraction -- confirm against the repository copy.
    raw = r""

    normalized = normalize_markdown(raw)

    assert normalized.markdown == ""
    assert normalized.asset_links == ("assets/fig 1.png",)
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None:
    """With check_assets=True exactly one ASSET_LINK_MISSING warning is expected."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()

    # NOTE(review): the input and expected markdown literals are empty in the text
    # under review; the image markup was probably lost in extraction -- confirm
    # against the repository copy.
    normalized = normalize_markdown(
        "",
        markdown_dir=tmp_path,
        asset_root=tmp_path / "assets",
        check_assets=True,
    )

    assert normalized.markdown == ""
    warning_codes = [warning.code for warning in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_MISSING]
|
||||
|
||||
|
||||
# NOTE(review): both source literals below are empty in the text under review;
# the image markup was probably lost in extraction -- confirm against the
# repository copy.
@pytest.mark.parametrize(
    ("source", "expected_link"),
    [
        (r"", "fig.png"),
        ("", "outside.png"),
    ],
)
def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(source: str, expected_link: str) -> None:
    """The output ends with the expected link in parentheses and one ASSET_LINK_INVALID warning."""
    normalized = normalize_markdown(source)

    expected_suffix = f"({expected_link})"
    assert normalized.markdown.endswith(expected_suffix)
    warning_codes = [warning.code for warning in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None:
    """A remote link is reported, draws ASSET_LINK_INVALID, and the markdown is unchanged."""
    # NOTE(review): this literal is empty in the text under review, although a
    # remote link is still expected below; the image markup was probably lost in
    # extraction -- confirm against the repository copy.
    raw = ""

    normalized = normalize_markdown(raw)

    assert normalized.markdown == raw
    assert normalized.asset_links == ("https://example.test/fig.png",)
    warning_codes = [warning.code for warning in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(tmp_path: Path) -> None:
    """Normalizing with an existing asset under the markdown dir yields ASSET_LINK_INVALID."""
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()
    asset = asset_dir / "fig.png"
    asset.write_bytes(b"image")

    # NOTE(review): the f-string input and the expected markdown are empty in the
    # text under review, leaving ``asset`` unreferenced; the image markup was
    # probably lost in extraction -- confirm against the repository copy.
    normalized = normalize_markdown(f"", markdown_dir=tmp_path, asset_root=asset_dir, check_assets=True)

    assert normalized.markdown == ""
    warning_codes = [warning.code for warning in normalized.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID]
|
||||
|
||||
|
||||
def test_simple_pipe_table_is_preserved() -> None:
    """A pipe table, including the math in its cell, comes back unchanged and warning-free."""
    table = "| A | B |\n|---|---|\n| \\(x\\) | y |"

    normalized = normalize_markdown(table)

    assert normalized.markdown == table
    assert normalized.warnings == ()
|
||||
|
||||
|
||||
def test_complex_html_table_is_preserved_with_fallback_warning() -> None:
    """An HTML table with a rowspan is returned unchanged with a TABLE_FALLBACK warning."""
    html_table = '<table><tr><td rowspan="2">\\(x_i\\)</td><td>y</td></tr></table>'

    normalized = normalize_markdown(html_table)

    assert normalized.markdown == html_table
    warning_codes = [warning.code for warning in normalized.warnings]
    assert warning_codes == [WarningCode.TABLE_FALLBACK]
|
||||
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.math_render import MathJaxCommandResult, MathJaxRenderChecker
|
||||
from pdf2md.quality import MathCheckerUnavailable, MathExpression
|
||||
|
||||
|
||||
def test_mathjax_checker_batches_expressions_as_json(tmp_path: Path) -> None:
    """Both expressions reach the runner in one JSON payload and results map back in order."""
    helper = make_helper(tmp_path)
    calls = []

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # Record the decoded payload so the request shape can be asserted below.
        calls.append((command, json.loads(stdin), timeout_seconds))
        canned_reply = {
            "results": [
                {"index": 0, "ok": True},
                {"index": 1, "ok": False, "message": "Undefined control sequence"},
            ]
        }
        return MathJaxCommandResult(
            command,
            0,
            stdout=json.dumps(canned_reply),
        )

    checker = MathJaxRenderChecker(
        helper_path=helper,
        which=lambda executable: "C:/node/node.exe" if executable == "node" else None,
        runner=runner,
        timeout_seconds=7,
    )
    expressions = (
        MathExpression(0, "x_i^2", False, (0, 7)),
        MathExpression(1, "\\bad", True, (9, 18)),
    )

    results = checker.check_expressions(expressions)

    assert [result.ok for result in results] == [True, False]
    assert results[1].message == "Undefined control sequence"
    expected_payload = {
        "expressions": [
            {"index": 0, "body": "x_i^2", "display": False},
            {"index": 1, "body": "\\bad", "display": True},
        ]
    }
    expected_call = (("C:/node/node.exe", str(helper)), expected_payload, 7)
    # Exactly one runner invocation is expected for the whole batch.
    assert calls == [expected_call]
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_missing_node_as_unavailable(tmp_path: Path) -> None:
    """When ``which`` finds nothing, checking raises MathCheckerUnavailable naming Node.js."""
    helper = make_helper(tmp_path)
    checker = MathJaxRenderChecker(helper_path=helper, which=lambda _: None)
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="Node.js"):
        checker.check_expressions(single_expression)
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_helper_failure_as_unavailable(tmp_path: Path) -> None:
    """A runner result with exit code 124 surfaces as MathCheckerUnavailable with its stderr."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        return MathJaxCommandResult(command, 124, stderr="MathJax helper timed out")

    checker = MathJaxRenderChecker(
        helper_path=helper,
        which=lambda _: "node",
        runner=runner,
    )
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="timed out"):
        checker.check_expressions(single_expression)
|
||||
|
||||
|
||||
def test_mathjax_checker_reports_invalid_json_as_unavailable(tmp_path: Path) -> None:
    """Non-JSON stdout from the helper surfaces as MathCheckerUnavailable ('invalid JSON')."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        return MathJaxCommandResult(command, 0, stdout="not json")

    checker = MathJaxRenderChecker(
        helper_path=helper,
        which=lambda _: "node",
        runner=runner,
    )
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="invalid JSON"):
        checker.check_expressions(single_expression)
|
||||
|
||||
|
||||
def test_mathjax_checker_rejects_mismatched_result_indexes(tmp_path: Path) -> None:
    """A result for index 99 when only index 0 was sent raises MathCheckerUnavailable."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        mismatched_reply = {"results": [{"index": 99, "ok": True}]}
        return MathJaxCommandResult(command, 0, stdout=json.dumps(mismatched_reply))

    checker = MathJaxRenderChecker(
        helper_path=helper,
        which=lambda _: "node",
        runner=runner,
    )
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="indexes"):
        checker.check_expressions(single_expression)
|
||||
|
||||
|
||||
def make_helper(tmp_path: Path) -> Path:
    """Write a placeholder ``check.mjs`` into *tmp_path* and return its path."""
    helper_file = tmp_path.joinpath("check.mjs")
    helper_file.write_text("// fake helper", encoding="utf-8")
    return helper_file
|
||||
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
|
||||
|
||||
|
||||
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build a two-page ``DocumentRecord`` fixture with one asset and two warnings.

    Page 0 carries a heading, an inline formula (with confidence) and a display
    formula (with bbox); page 1 carries a paragraph and a display formula.
    """
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    # Warnings are deliberately listed page 1 first, then page 0.
    recorded_warnings = (
        WarningRecord(WarningCode.READING_ORDER_UNCERTAIN, WarningSeverity.WARNING, "Check reading order.", page_index=1),
        WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.ERROR, "Math failed to render.", page_index=0),
    )
    return DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(
            PageRecord(page_index=0, blocks=first_page_blocks),
            PageRecord(page_index=1, blocks=second_page_blocks),
        ),
        assets=(AssetRecord("paper.assets/figure.png", page_index=1),),
        warnings=recorded_warnings,
    )
|
||||
|
||||
|
||||
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Run ``build_metadata`` over the ``make_document`` fixture with fixed provenance values."""
    provenance = {
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
        "engine_options": {"strict_local": True},
    }
    return build_metadata(document=make_document(tmp_path), **provenance)
|
||||
|
||||
|
||||
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The metadata mapping exposes exactly the expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }

    assert set(build_test_metadata(tmp_path)) == expected_keys
|
||||
|
||||
|
||||
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """The summary section matches the counts implied by the fixture records."""
    summary = build_test_metadata(tmp_path)["summary"]

    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }
    assert summary == expected_summary
|
||||
|
||||
|
||||
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Warnings keep their input order and their originating page indexes."""
    emitted = build_test_metadata(tmp_path)["warnings"]

    codes = [entry["code"] for entry in emitted]
    assert codes == [
        "READING_ORDER_UNCERTAIN",
        "MATH_RENDER_FAILED",
    ]
    # The fixture lists the page-1 warning before the page-0 warning.
    assert emitted[0]["page_index"] == 1
    assert emitted[1]["page_index"] == 0
|
||||
|
||||
|
||||
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """``confidence`` and ``bbox`` appear only on the blocks that supplied them."""
    first_page_blocks = build_test_metadata(tmp_path)["pages"][0]["blocks"]
    heading_block = first_page_blocks[0]
    inline_block = first_page_blocks[1]
    display_block = first_page_blocks[2]

    assert "confidence" not in heading_block
    assert inline_block["confidence"] == 0.98
    assert "bbox" not in inline_block
    # The fixture supplies the bbox as a tuple; the metadata compares equal to a list.
    assert display_block["bbox"] == [1.0, 2.0, 3.0, 4.0]
|
||||
|
||||
|
||||
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """The full metadata mapping can be encoded by ``json.dumps`` without raising."""
    payload = build_test_metadata(tmp_path)
    json.dumps(payload)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Blanking any single core input raises ``MetadataInputError`` mentioning that field."""
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Overlay the one invalid value under test on top of the valid baseline.
    call_arguments = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**call_arguments)
|
||||
|
||||
|
||||
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Engine options holding a ``Path`` value are rejected as not JSON serializable."""
    document = make_document(tmp_path)
    unserializable_options = {"path": tmp_path}

    with pytest.raises(MetadataInputError, match="JSON serializable"):
        build_metadata(
            document=document,
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=unserializable_options,
        )
|
||||
|
||||
|
||||
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A document with only PARAGRAPH and UNKNOWN blocks reports zero formulas."""
    non_formula_blocks = (BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(PageRecord(page_index=0, blocks=non_formula_blocks),),
    )

    counts = build_summary(document)

    assert counts["inline_formula_count"] == 0
    assert counts["display_formula_count"] == 0
|
||||
|
||||
|
||||
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity MATH_RENDER_FAILED warning counts as a warning but not as a render error."""
    info_warning = WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.INFO, "Checker unavailable.")
    formula_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.INLINE_FORMULA),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page,),
        warnings=(info_warning,),
    )

    counts = build_summary(document)

    assert counts["warning_count"] == 1
    assert counts["math_render_error_count"] == 0
|
||||
@@ -0,0 +1,264 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.mineru_adapter import (
|
||||
CommandResult,
|
||||
MinerUAdapter,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
)
|
||||
|
||||
|
||||
class FakeRunner:
    """Callable runner double that replays queued ``CommandResult`` objects in order.

    Every command it receives is appended to ``commands`` so tests can assert
    on exactly what was (or was not) executed.
    """

    def __init__(self, *results: CommandResult) -> None:
        self.results = list(results)
        self.commands: list[tuple[str, ...]] = []

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Record *command* and return the next queued result, re-bound to that command.

        Raises:
            AssertionError: if no queued result is left.
        """
        self.commands.append(command)
        if self.results:
            queued = self.results.pop(0)
            # Copy the queued outcome but report the command actually received.
            return CommandResult(
                command=command,
                exit_code=queued.exit_code,
                stdout=queued.stdout,
                stderr=queued.stderr,
            )
        raise AssertionError("fake runner was called without a queued result")
|
||||
|
||||
|
||||
class EnvironmentRunner:
    """Runner double that snapshots GPU-related environment variables when invoked."""

    def __init__(self) -> None:
        self.mineru_device_mode: str | None = None
        self.cuda_visible_devices: str | None = None

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Capture the environment, write a minimal ``paper.md`` into the ``-o`` directory, and succeed."""
        environment = os.environ
        self.mineru_device_mode = environment.get("MINERU_DEVICE_MODE")
        self.cuda_visible_devices = environment.get("CUDA_VISIBLE_DEVICES")

        # The output directory is the argument that follows the "-o" flag.
        output_flag_position = command.index("-o")
        work_dir = Path(command[output_flag_position + 1])
        work_dir.mkdir(parents=True, exist_ok=True)
        work_dir.joinpath("paper.md").write_text("# Title\n", encoding="utf-8")
        return CommandResult(command=command, exit_code=0)
|
||||
|
||||
|
||||
def available(_: str) -> str:
    """``which`` stub that reports the same executable path for any name."""
    located_executable = "C:/local/bin/mineru.exe"
    return located_executable
|
||||
|
||||
|
||||
def missing(_: str) -> None:
    """``which`` stub that reports no executable for any name."""
    return None
|
||||
|
||||
|
||||
def test_availability_check_uses_mockable_which() -> None:
    """``is_available()`` follows the injected ``which`` callable."""
    found_adapter = MinerUAdapter(which=available, runner=FakeRunner())
    absent_adapter = MinerUAdapter(which=missing, runner=FakeRunner())

    assert found_adapter.is_available() is True
    assert absent_adapter.is_available() is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("executable", ["mineru-api", "python", "C:/tools/mineru.exe"])
def test_custom_executable_is_rejected(executable: str) -> None:
    """Constructing the adapter with any of the listed executables raises ``StrictLocalViolationError``."""
    recorder = FakeRunner()

    with pytest.raises(StrictLocalViolationError):
        MinerUAdapter(executable=executable, which=available, runner=recorder)
|
||||
|
||||
|
||||
def test_missing_mineru_does_not_call_runner(tmp_path: Path) -> None:
    """``convert()`` without a MinerU executable fails with ENGINE_MISSING and never runs a command."""
    recorder = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=recorder)

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work")

    warning_codes = [warning.code for warning in outcome.warnings]
    assert outcome.succeeded is False
    assert outcome.exit_code is None
    assert recorder.commands == []
    assert warning_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_missing_mineru_version_does_not_call_runner() -> None:
    """``version()`` without a MinerU executable reports ENGINE_MISSING and never runs a command."""
    recorder = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=recorder)

    probe = adapter.version()

    warning_codes = [warning.code for warning in probe.warnings]
    assert probe.available is False
    assert probe.exit_code is None
    assert recorder.commands == []
    assert warning_codes == [WarningCode.ENGINE_MISSING]
|
||||
|
||||
|
||||
def test_version_success_uses_stdout() -> None:
    """A version printed on stdout is returned stripped, and the exact command is recorded."""
    expected_command = ("mineru", "--version")
    recorder = FakeRunner(CommandResult((), 0, stdout="MinerU 3.1.0\n"))
    adapter = MinerUAdapter(which=available, runner=recorder)

    probe = adapter.version()

    assert probe.available is True
    assert probe.version == "MinerU 3.1.0"
    assert probe.command == expected_command
    assert recorder.commands == [expected_command]
|
||||
|
||||
|
||||
def test_version_success_can_use_stderr() -> None:
    """A version printed only on stderr is still picked up."""
    stderr_only = CommandResult((), 0, stderr="MinerU 3.1.0\n")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(stderr_only))

    probe = adapter.version()

    assert probe.version == "MinerU 3.1.0"
|
||||
|
||||
|
||||
def test_version_failure_is_explicit() -> None:
    """A non-zero exit from the version command yields no version and a MINERU_CLI_FAILED warning."""
    failing_result = CommandResult((), 2, stdout="", stderr="bad version")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(failing_result))

    probe = adapter.version()

    warning_codes = [warning.code for warning in probe.warnings]
    assert probe.version is None
    assert probe.exit_code == 2
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_version_empty_output_is_explicit() -> None:
    """Exit code 0 with empty stdout and stderr yields no version and a MINERU_CLI_FAILED warning."""
    silent_result = CommandResult((), 0, stdout="", stderr="")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(silent_result))

    probe = adapter.version()

    warning_codes = [warning.code for warning in probe.warnings]
    assert probe.available is True
    assert probe.version is None
    assert probe.exit_code == 0
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
    """The command is a plain argument tuple, even for paths with spaces and non-ASCII names."""
    input_pdf = tmp_path / "논문 with spaces.pdf"
    work_dir = tmp_path / "work output"
    adapter = MinerUAdapter(which=available, runner=FakeRunner())

    command = adapter.build_command(input_pdf, work_dir)

    expected_command = ("mineru", "-p", str(input_pdf), "-o", str(work_dir))
    assert command == expected_command
    assert "--api-url" not in command
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "options",
    [
        MinerUOptions(extra_cli_args=("--api-url", "http://example.test")),
        MinerUOptions(engine_options={"api_url": "http://example.test"}),
        MinerUOptions(engine_options={"base_url": "http://example.test"}),
        MinerUOptions(engine_options={"mode": "router"}),
        MinerUOptions(engine_options={"backend": "http"}),
        MinerUOptions(engine_options={"openai_base_url": "http://example.test/v1"}),
        MinerUOptions(engine_options={"endpoint": "https://example.test"}),
        MinerUOptions(engine_options={"nested": {"url": "local http://example.test"}}),
        MinerUOptions(engine_options={"process": "mineru-api"}),
        MinerUOptions(gpu_device="https://example.test/gpu"),
        MinerUOptions(strict_local=False),
    ],
)
def test_strict_local_rejects_remote_router_and_backend_options(tmp_path: Path, options: MinerUOptions) -> None:
    """Each listed option set makes ``build_command`` raise ``StrictLocalViolationError``."""
    adapter = MinerUAdapter(which=available, runner=FakeRunner())
    input_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"

    with pytest.raises(StrictLocalViolationError):
        adapter.build_command(input_pdf, work_dir, options)
|
||||
|
||||
|
||||
def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path) -> None:
    """A successful run returns the markdown, the parsed JSON, and the asset paths in sorted order."""
    work_dir = tmp_path / "work"
    assets_dir = work_dir / "assets"
    input_pdf = tmp_path / "paper.pdf"

    (work_dir / "nested").mkdir(parents=True)
    (work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
    (work_dir / "structured.json").write_text('{"pages": 1}', encoding="utf-8")
    assets_dir.mkdir()
    # Created out of alphabetical order on purpose; the result is expected sorted.
    (assets_dir / "z.png").write_bytes(b"z")
    (assets_dir / "a.png").write_bytes(b"a")
    (assets_dir / "nested").mkdir()
    (assets_dir / "nested" / "b.png").write_bytes(b"b")
    # Extra files outside assets/ must not show up in asset_paths.
    for file_name, content in (
        ("zz_extra.md", "not an asset"),
        ("zz_extra.json", "{}"),
        ("run.log", "diagnostic"),
    ):
        (work_dir / file_name).write_text(content, encoding="utf-8")

    recorder = FakeRunner(CommandResult((), 0, stdout="ok", stderr="warn"))
    adapter = MinerUAdapter(which=available, runner=recorder)

    result = adapter.convert(
        input_pdf,
        work_dir,
        MinerUOptions(engine_version="3.1.0", gpu_device="cuda:0"),
    )

    relative_assets = [path.relative_to(work_dir).as_posix() for path in result.asset_paths]
    assert result.succeeded is True
    assert result.command == ("mineru", "-p", str(input_pdf), "-o", str(work_dir))
    assert result.raw_markdown == "# Title\n"
    assert result.raw_structured == {"pages": 1}
    assert relative_assets == [
        "assets/a.png",
        "assets/nested/b.png",
        "assets/z.png",
    ]
    assert result.engine == "MinerU"
    assert result.engine_version == "3.1.0"
    assert result.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
    assert result.exit_code == 0
    assert result.stdout == "ok"
    assert result.stderr == "warn"
|
||||
|
||||
|
||||
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``gpu_device="cuda:0"`` is exported to the runner and the prior environment is restored afterwards.

    Args:
        tmp_path: pytest-provided scratch directory for the input and work paths.
        monkeypatch: pytest fixture used to seed the pre-existing environment values.
    """
    # Seed values that differ from what the adapter should export for "cuda:0".
    monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
    runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=runner)

    result = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work", MinerUOptions(gpu_device="cuda:0"))

    assert result.succeeded is True
    # Values observed by the runner while the command was executing.
    assert runner.mineru_device_mode == "cuda"
    assert runner.cuda_visible_devices == "0"
    # Values visible again once convert() has returned.
    assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
|
||||
|
||||
|
||||
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
    """After a non-zero exit, markdown already present in the work dir is not returned."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    # Pre-existing output that must be ignored because the command fails.
    (work_dir / "paper.md").write_text("existing output", encoding="utf-8")
    failing_result = CommandResult((), 3, stdout="out", stderr="failed")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(failing_result))

    outcome = adapter.convert(tmp_path / "paper.pdf", work_dir)

    warning_codes = [warning.code for warning in outcome.warnings]
    assert outcome.succeeded is False
    assert outcome.raw_markdown is None
    assert outcome.asset_paths == ()
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_exit_zero_with_no_usable_output_warns(tmp_path: Path) -> None:
    """Exit code 0 with an empty work dir is a failure carrying a "no usable" warning."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    adapter = MinerUAdapter(which=available, runner=FakeRunner(CommandResult((), 0)))

    outcome = adapter.convert(tmp_path / "paper.pdf", work_dir)

    warning_codes = [warning.code for warning in outcome.warnings]
    assert outcome.succeeded is False
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
    assert "no usable" in outcome.warnings[0].message
|
||||
|
||||
|
||||
def test_invalid_json_is_preserved_as_text_with_warning(tmp_path: Path) -> None:
    """Malformed structured JSON is returned as raw text alongside a warning; the run still succeeds."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    for file_name, content in (("paper.md", "markdown"), ("structured.json", "{not json")):
        (work_dir / file_name).write_text(content, encoding="utf-8")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(CommandResult((), 0)))

    outcome = adapter.convert(tmp_path / "paper.pdf", work_dir)

    warning_codes = [warning.code for warning in outcome.warnings]
    assert outcome.succeeded is True
    assert outcome.raw_structured == "{not json"
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
|
||||
@@ -0,0 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pdf2md
|
||||
|
||||
|
||||
def test_package_imports() -> None:
    """The package exposes its version string and a callable ``convert_pdf``."""
    reported_version = pdf2md.__version__
    entry_point = pdf2md.convert_pdf

    assert reported_version == "0.1.0"
    assert callable(entry_point)
|
||||
@@ -0,0 +1,188 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.paths import (
|
||||
DiscoveredPdf,
|
||||
DuplicateOutputPathError,
|
||||
InputDiscoveryError,
|
||||
OutputConflictError,
|
||||
OutputPathError,
|
||||
OutputRootError,
|
||||
discover_pdfs,
|
||||
plan_outputs,
|
||||
plan_pdf_outputs,
|
||||
)
|
||||
|
||||
|
||||
def touch(path: Path) -> Path:
    """Write an empty file at *path*, creating missing parent directories, and return *path*."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_bytes(b"")
    return path
|
||||
|
||||
|
||||
def test_discovers_single_pdf_case_insensitive(tmp_path: Path) -> None:
    """A single file with an upper-case ``.PDF`` suffix is discovered by its resolved path."""
    source = touch(tmp_path / "Paper.PDF")

    found = discover_pdfs(source)

    expected = (DiscoveredPdf(source_path=source.resolve()),)
    assert found == expected
|
||||
|
||||
|
||||
def test_rejects_nonexistent_and_non_pdf_inputs(tmp_path: Path) -> None:
    """Missing paths and non-PDF files each raise ``InputDiscoveryError`` with a specific message."""
    absent_pdf = tmp_path / "missing.pdf"
    with pytest.raises(InputDiscoveryError, match="does not exist"):
        discover_pdfs(absent_pdf)

    notes_file = touch(tmp_path / "notes.txt")
    with pytest.raises(InputDiscoveryError, match="not a PDF"):
        discover_pdfs(notes_file)
|
||||
|
||||
|
||||
def test_discovers_directory_non_recursive_only(tmp_path: Path) -> None:
    """Non-recursive discovery returns the top-level PDF and skips the nested one."""
    top_level_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")

    found = discover_pdfs(tmp_path, recursive=False)

    found_list = [item.source_path for item in found]
    found_set = {item.source_path for item in found}
    assert found_list == [top_level_pdf.resolve()]
    assert nested_pdf.resolve() not in found_set
|
||||
|
||||
|
||||
def test_non_recursive_directory_with_only_nested_pdfs_fails(tmp_path: Path) -> None:
    """A directory whose only PDF sits in a subdirectory fails non-recursive discovery."""
    touch(tmp_path / "nested" / "child.pdf")

    with pytest.raises(InputDiscoveryError, match="no PDF files"):
        discover_pdfs(tmp_path, recursive=False)
|
||||
|
||||
|
||||
def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> None:
    """Recursive discovery reports each PDF with its parent directory relative to the root."""
    top_level_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")
    deepest_pdf = touch(tmp_path / "nested" / "deeper" / "leaf.PdF")

    found = discover_pdfs(tmp_path, recursive=True)

    observed = [(item.source_path, item.relative_parent) for item in found]
    assert observed == [
        (nested_pdf.resolve(), Path("nested")),
        (deepest_pdf.resolve(), Path("nested") / "deeper"),
        (top_level_pdf.resolve(), Path()),
    ]
|
||||
|
||||
|
||||
def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
    """Two discovery passes agree, and mixed-case and non-ASCII names come back in a fixed order."""
    for file_name in ("한글.pdf", "Alpha.pdf", "beta.PDF"):
        touch(tmp_path / file_name)

    first_pass = discover_pdfs(tmp_path)
    second_pass = discover_pdfs(tmp_path)

    discovered_names = [item.source_path.name for item in first_pass]
    assert discovered_names == ["Alpha.pdf", "beta.PDF", "한글.pdf"]
    assert first_pass == second_pass
|
||||
|
||||
|
||||
def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
    """Default planning derives the markdown, assets, metadata and report paths from the PDF stem."""
    pdf = touch(tmp_path / "입력.pdf")
    output_root = tmp_path / "out"

    (plan,) = plan_pdf_outputs(pdf, output_root)

    resolved_root = output_root.resolve()
    assert plan.source_pdf == pdf.resolve()
    assert plan.markdown_path == resolved_root / "입력.md"
    assert plan.assets_dir == resolved_root / "입력.assets"
    assert plan.metadata_path == resolved_root / "입력.metadata.json"
    assert plan.report_path == resolved_root / "입력.report.md"
    # No raw directory is planned unless it is requested.
    assert plan.raw_dir is None
|
||||
|
||||
|
||||
def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None:
    """``metadata=False`` drops the metadata path; ``keep_raw=True`` adds a ``.raw`` directory."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"

    (plan_without_metadata,) = plan_pdf_outputs(pdf, output_root, metadata=False)
    (plan_with_raw,) = plan_pdf_outputs(pdf, output_root, keep_raw=True)

    resolved_root = output_root.resolve()
    assert plan_without_metadata.metadata_path is None
    assert plan_without_metadata.report_path == resolved_root / "paper.report.md"
    assert plan_with_raw.raw_dir == resolved_root / "paper.raw"
|
||||
|
||||
|
||||
def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
    """Same-named PDFs in different subdirectories map to distinct outputs mirroring the input tree."""
    input_root = tmp_path / "pdfs"
    output_root = tmp_path / "out"
    touch(input_root / "same.pdf")
    touch(input_root / "nested" / "same.pdf")

    plans = plan_pdf_outputs(input_root, output_root, recursive=True)

    resolved_root = output_root.resolve()
    relative_markdown_paths = [plan.markdown_path.relative_to(resolved_root) for plan in plans]
    assert relative_markdown_paths == [
        Path("nested") / "same.md",
        Path("same.md"),
    ]
|
||||
|
||||
|
||||
def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
    """Two same-named PDFs without relative parents collide and raise ``DuplicateOutputPathError``."""
    colliding_sources = (
        touch(tmp_path / "first" / "same.pdf"),
        touch(tmp_path / "second" / "same.pdf"),
    )
    discovered = tuple(DiscoveredPdf(source_path=source.resolve()) for source in colliding_sources)

    with pytest.raises(DuplicateOutputPathError, match="duplicated"):
        plan_outputs(discovered, tmp_path / "out")
|
||||
|
||||
|
||||
def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
    """Every planned path that already exists is listed on the raised ``OutputConflictError``."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    # Pre-create three of the planned outputs: two as directories, one as a file.
    (output_root / "paper.assets").mkdir(parents=True)
    (output_root / "paper.md").mkdir()
    touch(output_root / "paper.metadata.json")

    with pytest.raises(OutputConflictError) as excinfo:
        plan_pdf_outputs(pdf, output_root)

    reported_names = {path.name for path in excinfo.value.conflicts}
    assert reported_names == {"paper.assets", "paper.md", "paper.metadata.json"}
|
||||
|
||||
|
||||
def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
    """With ``overwrite=True`` planning succeeds and the existing output file is still present."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    preexisting_markdown = touch(output_root / "paper.md")

    (plan,) = plan_pdf_outputs(pdf, output_root, overwrite=True)

    assert plan.markdown_path == preexisting_markdown.resolve()
    assert preexisting_markdown.exists()
|
||||
|
||||
|
||||
def test_output_root_cannot_be_existing_file(tmp_path: Path) -> None:
    """An output root that is an existing regular file raises ``OutputRootError``."""
    pdf = touch(tmp_path / "paper.pdf")
    file_as_root = touch(tmp_path / "out")

    with pytest.raises(OutputRootError, match="not a directory"):
        plan_pdf_outputs(pdf, file_as_root)
|
||||
|
||||
|
||||
def test_planned_paths_cannot_escape_output_root(tmp_path: Path) -> None:
    """A ``..`` relative parent raises ``OutputPathError``."""
    pdf = touch(tmp_path / "paper.pdf")
    escaping_entry = DiscoveredPdf(source_path=pdf.resolve(), relative_parent=Path(".."))

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((escaping_entry,), tmp_path / "out")
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.name != "nt", reason="Windows rooted path behavior")
@pytest.mark.parametrize("relative_parent", [Path("\\outside"), Path("/outside"), Path("C:outside")])
def test_windows_rooted_relative_parents_cannot_escape_output_root(
    tmp_path: Path,
    relative_parent: Path,
) -> None:
    """On Windows, rooted or drive-qualified relative parents raise ``OutputPathError``."""
    pdf = touch(tmp_path / "paper.pdf")
    rooted_entry = DiscoveredPdf(source_path=pdf.resolve(), relative_parent=relative_parent)

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((rooted_entry,), tmp_path / "out")
|
||||
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from pdf2md.pdf_splitter import PdfChunkError, count_pdf_pages, plan_pdf_chunks, write_pdf_chunk
|
||||
|
||||
|
||||
def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF of *page_count* blank 72x72 pages to *path* and return *path*."""
    path.parent.mkdir(parents=True, exist_ok=True)

    document = PdfWriter()
    pages_added = 0
    while pages_added < page_count:
        document.add_blank_page(width=72, height=72)
        pages_added += 1

    with path.open("wb") as stream:
        document.write(stream)
    return path
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("page_count", "expected_ranges"),
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Chunks of at most 20 pages use 1-based inclusive ranges, reflected in zero-padded file names."""
    pdf = make_blank_pdf(tmp_path / "paper.pdf", page_count)

    chunks = plan_pdf_chunks(pdf, chunk_pages=20)

    planned_ranges = [(chunk.source_page_start, chunk.source_page_end) for chunk in chunks]
    planned_names = [chunk.output_filename for chunk in chunks]
    expected_names = [
        f"paper.part-{index:03d}.pages-{start:03d}-{end:03d}.pdf"
        for index, (start, end) in enumerate(expected_ranges, start=1)
    ]
    assert count_pdf_pages(pdf) == page_count
    assert planned_ranges == expected_ranges
    assert planned_names == expected_names
|
||||
|
||||
|
||||
def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Writing the second chunk of a 41-page PDF produces a file with 20 pages."""
    pdf = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned_chunks = plan_pdf_chunks(pdf, chunk_pages=20)
    second_chunk = planned_chunks[1]
    destination = tmp_path / "chunks" / second_chunk.output_filename

    written = write_pdf_chunk(second_chunk, destination)

    assert written.exists()
    assert len(PdfReader(written).pages) == 20
|
||||
|
||||
|
||||
def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """A chunk size of 0 raises ``PdfChunkError`` mentioning "positive integer"."""
    single_page_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)

    with pytest.raises(PdfChunkError, match="positive integer"):
        plan_pdf_chunks(single_page_pdf, chunk_pages=0)
|
||||
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.ir import WarningCode, WarningSeverity
|
||||
from pdf2md.quality import (
|
||||
MathCheckerUnavailable,
|
||||
MathCheckResult,
|
||||
check_asset_links,
|
||||
check_math_renderability,
|
||||
extract_math_expressions,
|
||||
merge_quality_results,
|
||||
)
|
||||
|
||||
|
||||
def test_missing_asset_link_is_counted(tmp_path: Path) -> None:
    """An image link to a file that does not exist is counted as a missing asset link."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    # NOTE(review): this literal was an empty string in the extracted source, which
    # cannot produce the one missing link asserted below; the image link appears to
    # have been stripped by the page renderer. The target is a reconstruction
    # (any path under assets/ that is never created) -- confirm against the repository.
    markdown = "![Figure](assets/missing.png)"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=asset_root)

    assert result.missing_asset_link_count == 1
    assert result.invalid_asset_link_count == 0
    assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_MISSING]
|
||||
|
||||
|
||||
def test_existing_asset_link_passes_without_warning(tmp_path: Path) -> None:
    """An image link to a file present under the asset root produces no failures or warnings."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    (asset_root / "fig.png").write_bytes(b"image")
    # NOTE(review): this literal was an empty string in the extracted source, so the
    # file created above was never referenced; the image link appears to have been
    # stripped by the page renderer. Reconstructed to point at that file -- confirm
    # against the repository.
    markdown = "![Figure](assets/fig.png)"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=asset_root)

    assert result.failure_count == 0
    assert result.warnings == ()
|
||||
|
||||
|
||||
def test_invalid_asset_links_are_counted_without_fetching(tmp_path: Path) -> None:
    """Three image links with unacceptable targets are each counted as invalid, none as missing."""
    # NOTE(review): all three entries were empty strings in the extracted source (the
    # third a raw string, which suggests a backslash path); the image links appear to
    # have been stripped by the page renderer. The targets below -- a remote URL, a
    # parent-directory escape and a Windows absolute path -- are a best-guess
    # reconstruction. Confirm them against the repository and against what
    # check_asset_links actually classifies as invalid.
    markdown = "\n".join(
        [
            "![remote](https://example.test/fig.png)",
            "![escape](../outside.png)",
            r"![windows](C:\temp\fig.png)",
        ]
    )

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=tmp_path / "assets")

    assert result.invalid_asset_link_count == 3
    assert result.missing_asset_link_count == 0
    assert [warning.code for warning in result.warnings] == [WarningCode.ASSET_LINK_INVALID] * 3
|
||||
|
||||
|
||||
def test_asset_links_inside_code_are_ignored(tmp_path: Path) -> None:
    """Image links inside a fenced code block or an inline code span are not checked."""
    # NOTE(review): in the extracted source the fence body and the inline code span
    # were both empty, which makes the test vacuous; the image links appear to have
    # been stripped by the page renderer. The link text is a reconstruction that
    # points at a file which is never created -- confirm against the repository.
    markdown = "```md\n![Figure](assets/missing.png)\n```\n`![Figure](assets/missing.png)`"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=tmp_path / "assets")

    assert result.failure_count == 0
    assert result.warnings == ()
|
||||
|
||||
|
||||
def test_math_render_failures_are_aggregated_with_fake_checker() -> None:
    """Only the expression the fake checker rejects is reported, with the checker's message."""

    def checker(body: str) -> MathCheckResult:
        # Any body containing "bad" is treated as a render failure.
        renders = "bad" not in body
        return MathCheckResult(ok=renders, message=f"{body} failed")

    markdown = "$x_i^2$\n\n$$\nbad_math\n$$"

    result = check_math_renderability(markdown, checker)

    warning_codes = [warning.code for warning in result.warnings]
    assert result.math_render_error_count == 1
    assert warning_codes == [WarningCode.MATH_RENDER_FAILED]
    assert "bad_math failed" in result.warnings[0].message
|
||||
|
||||
|
||||
def test_math_extraction_records_display_mode_and_markdown_spans() -> None:
    """Extraction yields index, body and display flag, and spans that cover the delimiters."""
    markdown = "Inline $x_i^2$ before\n\n$$\n\\frac{1}{2}\n$$\n"

    expressions = extract_math_expressions(markdown)

    summaries = [(expression.index, expression.body, expression.display) for expression in expressions]
    spans = [expression.markdown_span for expression in expressions]
    # Slicing the source with each span must give back the delimited expression text.
    spanned_text = [markdown[begin:stop] for begin, stop in spans]
    assert summaries == [
        (0, "x_i^2", False),
        (1, "\\frac{1}{2}", True),
    ]
    assert spanned_text == [
        "$x_i^2$",
        "$$\n\\frac{1}{2}\n$$",
    ]
|
||||
|
||||
|
||||
def test_math_extraction_ignores_code_and_currency_like_text() -> None:
    """Math in code fences, inline code and currency-like ``$12.00$`` text is skipped; only ``$z$`` remains."""
    markdown = "```tex\n$x$\n```\n`$y$`\nPrice $12.00$ and real $z$."

    expressions = extract_math_expressions(markdown)

    extracted = [(expression.body, expression.display) for expression in expressions]
    assert extracted == [("z", False)]
|
||||
|
||||
|
||||
def test_batch_math_checker_receives_expression_records() -> None:
    """A checker with ``check_expressions`` receives all expression records in one batch."""

    class BatchChecker:
        """Checker double that stores the batch it was given and fails non-display math."""

        def __init__(self) -> None:
            self.expressions = ()

        def check_expressions(self, expressions):
            self.expressions = expressions
            # Report inline math as failing and display math as passing.
            return tuple(MathCheckResult(ok=item.display) for item in expressions)

    checker = BatchChecker()
    markdown = "$inline$\n\n$$\ndisplay\n$$"

    result = check_math_renderability(markdown, checker)

    received_bodies = [expression.body for expression in checker.expressions]
    assert received_bodies == ["inline", "display"]
    assert result.math_render_error_count == 1
    assert "inline" in result.warnings[0].message
|
||||
|
||||
|
||||
def test_math_checker_unavailable_is_nonfatal() -> None:
    """A checker raising ``MathCheckerUnavailable`` yields an INFO warning and zero render errors."""

    def checker(_: str) -> bool:
        raise MathCheckerUnavailable("local renderer missing")

    result = check_math_renderability("$x$", checker)

    first_warning = result.warnings[0]
    assert result.math_render_error_count == 0
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
|
||||
|
||||
|
||||
def test_missing_math_checker_is_explicit_and_nonfatal() -> None:
    """Calling without a checker yields an INFO warning and zero render errors."""
    result = check_math_renderability("$x$")

    first_warning = result.warnings[0]
    assert result.math_render_error_count == 0
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
|
||||
|
||||
|
||||
def test_merge_quality_results_combines_counts_and_warning_order(tmp_path: Path) -> None:
    """Merging keeps both counts and lists the asset warning before the math warning."""
    # NOTE(review): this literal was an empty string in the extracted source, which
    # cannot produce the one missing link asserted below; the image link appears to
    # have been stripped by the page renderer. The target is a reconstruction that
    # points at a file which is never created -- confirm against the repository.
    markdown_with_missing_asset = "![Figure](assets/missing.png)"
    asset_result = check_asset_links(markdown_with_missing_asset, markdown_dir=tmp_path)
    # The lambda checker rejects every expression, giving exactly one render error.
    math_result = check_math_renderability("$x$", lambda _: False)

    result = merge_quality_results(asset_result, math_result)

    assert result.missing_asset_link_count == 1
    assert result.math_render_error_count == 1
    assert [warning.code for warning in result.warnings] == [
        WarningCode.ASSET_LINK_MISSING,
        WarningCode.MATH_RENDER_FAILED,
    ]
|
||||
@@ -0,0 +1,163 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pdf2md.ir import (
|
||||
AssetRecord,
|
||||
BlockRecord,
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
)
|
||||
from pdf2md.metadata import build_metadata
|
||||
from pdf2md.quality import QualityResult
|
||||
from pdf2md.report import determine_final_status, pages_with_warnings, render_report
|
||||
|
||||
|
||||
def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build metadata for a fixed two-page sample document.

    Page 0 holds one inline and one display formula, page 1 holds a single
    paragraph, and one asset is attached to page 1. ``warnings`` is forwarded
    unchanged to the document record. The result is whatever ``build_metadata``
    returns for that document with fixed hash, timestamp and engine details.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure_asset = AssetRecord("paper.assets/fig.png", page_index=1)

    sample_document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure_asset,),
        warnings=warnings,
    )

    return build_metadata(
        document=sample_document,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
|
||||
|
||||
|
||||
def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """determine_final_status maps clean, warning, quality-issue and error inputs to their status."""
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    engine_error = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )

    success_metadata = make_metadata(tmp_path)
    warning_metadata = make_metadata(tmp_path, warnings=(review_warning,))
    failed_metadata = make_metadata(tmp_path, warnings=(engine_error,))

    # Each case: positional arguments for determine_final_status, expected status.
    cases = (
        ((success_metadata,), "success"),
        ((warning_metadata,), "partial"),
        ((success_metadata, QualityResult(missing_asset_link_count=1)), "partial"),
        ((failed_metadata,), "failed"),
    )
    for arguments, expected_status in cases:
        assert determine_final_status(*arguments) == expected_status
|
||||
|
||||
|
||||
def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Warning pages from metadata (page 1) and quality (page 0) come back as the tuple (0, 1)."""
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    quality_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )

    metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    quality = QualityResult(warnings=(quality_warning,))

    assert pages_with_warnings(metadata, quality) == (0, 1)
|
||||
|
||||
|
||||
def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """The rendered report contains the heading, paths, engine details and every count line."""
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    asset_warning = WarningRecord(
        WarningCode.ASSET_LINK_MISSING,
        WarningSeverity.WARNING,
        "Missing asset.",
    )
    metadata = make_metadata(tmp_path, warnings=(review_warning,))
    quality = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(asset_warning,),
    )

    report = render_report(
        metadata,
        quality=quality,
        markdown_path=tmp_path / "paper.md",
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    # Checked in this order; the first absent fragment fails the test.
    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {tmp_path / 'paper.pdf'}",
        f"- Output Markdown: {tmp_path / 'paper.md'}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||||
|
||||
|
||||
def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Without path arguments the report lists no output paths and no report file appears on disk."""
    metadata = make_metadata(tmp_path)
    would_be_report_file = tmp_path / "paper.report.md"

    report = render_report(metadata)

    for absent_label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert absent_label not in report
    # Rendering must not have created the report file as a side effect.
    assert not would_be_report_file.exists()
|
||||
|
||||
|
||||
def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """A single ERROR-severity metadata warning makes the report show the `failed` status."""
    engine_error = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU failed.",
    )

    report = render_report(make_metadata(tmp_path, warnings=(engine_error,)))

    assert "- Final status: `failed`" in report
|
||||
|
||||
|
||||
def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """One metadata math warning plus a quality count of two is reported as three math errors."""
    metadata_math_failure = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.ERROR,
        "Metadata math failed.",
    )
    metadata = make_metadata(tmp_path, warnings=(metadata_math_failure,))

    report = render_report(metadata, quality=QualityResult(math_render_error_count=2))

    assert "- Math render error count: 3" in report
|
||||
|
||||
|
||||
def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Chunk details placed under engine_options appear as a single `Chunk:` line in the report."""
    chunk_options = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    metadata = make_metadata(tmp_path)
    # Replace the engine options wholesale so the chunk details are present.
    metadata["engine_options"] = {"strict_local": True, "chunk": chunk_options}

    report = render_report(metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in report
|
||||
Reference in New Issue
Block a user