add pdftomd

This commit is contained in:
김경종
2026-05-08 16:42:19 +09:00
parent 551ab50735
commit 88d6b92283
99 changed files with 47332 additions and 0 deletions
@@ -0,0 +1,118 @@
from __future__ import annotations
import json
import os
import re
import subprocess
import sys
from pathlib import Path
import pytest
# Opt-in gate: the whole module is skipped at import/collection time unless
# the environment variable PDF2MD_RUN_MINERU_FIXTURES is exactly "1".
if os.environ.get("PDF2MD_RUN_MINERU_FIXTURES") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )
# Two directory levels above the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory that the fixture-evaluation test in this module globs for *.pdf inputs.
SAMPLES_DIR = REPO_ROOT / "samples"
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Run the real CLI over every local sample PDF and verify the release outputs.

    The test is skipped when ``pdf2md.cli doctor`` exits non-zero or does not
    finish within its timeout, and when no ``*.pdf`` exists under
    ``SAMPLES_DIR``. For each sample it runs ``pdf2md.cli convert``, asserts a
    zero exit code, checks the Markdown, metadata JSON and report files, and
    finally writes a ``fixture-evaluation.json`` record of all attempts.
    """
    try:
        doctor = subprocess.run(
            [sys.executable, "-m", "pdf2md.cli", "doctor"],
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            # Bound the probe so a hung environment check cannot stall the
            # run; every conversion below is bounded as well.
            timeout=300,
        )
    except subprocess.TimeoutExpired as error:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor timeout:\n{error}")
    if doctor.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor.stdout}\n{doctor.stderr}")

    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda path: path.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []
    for pdf in sample_pdfs:
        sample_output = output_root / pdf.stem
        # Build the command once so the recorded command is exactly what ran.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(sample_output),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure the whole attempt (including captured output) is the message.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        markdown_path = sample_output / f"{pdf.stem}.md"
        metadata_path = sample_output / f"{pdf.stem}.metadata.json"
        report_path = sample_output / f"{pdf.stem}.report.md"
        assert markdown_path.exists()
        assert metadata_path.exists()
        assert report_path.exists()

        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
        summary = metadata["summary"]
        assert metadata["engine"] == "MinerU"
        assert summary["pages_processed"] >= 1
        assert "warning_count" in summary
        assert "math_render_error_count" in summary
        assert "asset_count" in summary

        report = report_path.read_text(encoding="utf-8")
        assert "Output Markdown:" in report
        assert "Metadata JSON:" in report
        assert "Report Markdown:" in report

        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "metadata_path": str(metadata_path),
                "report_path": str(report_path),
                "warning_count": summary["warning_count"],
                "final_status": _report_final_status(report),
                "math_render_error_count": summary["math_render_error_count"],
                "asset_count": summary["asset_count"],
                "pages_processed": summary["pages_processed"],
            }
        )

    record_path = output_root / "fixture-evaluation.json"
    record_path.write_text(json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    assert record_path.exists()
def _report_final_status(report: str) -> str:
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
return match.group("status") if match else "unavailable"
@@ -0,0 +1,152 @@
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
from pdf2md.cli import main
from pdf2md.conversion import convert_pdf
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult
class FixtureAdapter:
    """Adapter double that returns canned MinerU output for fast integration tests.

    Every ``convert`` call is recorded in ``calls`` as ``(input_path, work_dir)``.
    """

    def __init__(
        self,
        *,
        raw_markdown: str,
        raw_structured: object | None = None,
        succeeded: bool = True,
        asset_name: str | None = None,
        warnings: tuple[WarningRecord, ...] = (),
    ) -> None:
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.asset_name = asset_name
        self.warnings = warnings
        self.calls: list[tuple[Path, Path]] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create the work directory, optionally write a fake asset, and return the canned result."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append((source, target_dir))

        written_assets: tuple[Path, ...] = ()
        if self.asset_name is not None:
            fake_asset = target_dir / "assets" / self.asset_name
            fake_asset.parent.mkdir(parents=True, exist_ok=True)
            fake_asset.write_bytes(b"fake image")
            written_assets = (fake_asset,)

        cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown, reported, exit_code = self.raw_markdown, self.warnings, 0
        else:
            # A failed run exposes no Markdown and reports only the CLI failure.
            markdown, reported, exit_code = None, (cli_failure,), 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=written_assets,
            warnings=reported,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
def make_pdf(directory: Path, name: str) -> Path:
    """Write a tiny placeholder file named ``name`` under ``directory`` and return its path.

    Missing parent directories are created. The content is only a PDF header
    line followed by a marker line, not a complete PDF document.
    """
    target = directory / name
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\nfast integration fixture\n")
    return target
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
    """A successful conversion with math, a table and an image yields all outputs and counts."""
    source_pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
    out_dir = tmp_path / "out"
    raw_markdown = "".join(
        [
            "# Shell Element\n\n",
            "Inline \\(u_i\\) and display:\n\n",
            "\\[\nK u = f\n\\]\n\n",
            '<table><tr><td rowspan="2">\\(N_i\\)</td><td>stress</td></tr></table>\n\n',
            "![mesh](assets/mesh.png)\n",
        ]
    )
    adapter = FixtureAdapter(
        raw_markdown=raw_markdown,
        raw_structured={"pages": [{}, {}, {}]},
        asset_name="mesh.png",
    )

    result = convert_pdf(source_pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.final_status == "partial"
    assert result.markdown_path.exists()
    assert result.metadata_path is not None and result.metadata_path.exists()
    assert result.report_path.exists()
    copied_asset = out_dir / "쉘구조_math.assets" / "mesh.png"
    assert copied_asset.read_bytes() == b"fake image"

    markdown = result.markdown_path.read_text(encoding="utf-8")
    for fragment in ("$u_i$", "$$\nK u = f\n$$", "![mesh](쉘구조_math.assets/mesh.png)"):
        assert fragment in markdown

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    assert metadata["engine"] == "MinerU"
    assert metadata["engine_version"] == "3.1.0"
    assert summary["pages_processed"] == 3
    assert summary["asset_count"] == 1
    assert summary["inline_formula_count"] == 1
    assert summary["display_formula_count"] == 1
    assert summary["math_render_error_count"] == 0
    assert summary["warning_count"] == 1
    assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
    assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]

    report = result.report_path.read_text(encoding="utf-8")
    expected_report_fragments = (
        "- Final status: `partial`",
        "- Output Markdown:",
        "- Metadata JSON:",
        "- Report Markdown:",
        "- Math render error count: 0",
        "`TABLE_FALLBACK`",
    )
    for fragment in expected_report_fragments:
        assert fragment in report
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
    """A failing adapter yields a failed result with one warning and no output files."""
    source_pdf = make_pdf(tmp_path, "failed.pdf")
    failing_adapter = FixtureAdapter(raw_markdown="", succeeded=False)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert result.final_status == "failed"
    assert result.warning_count == 1
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MINERU_CLI_FAILED
    assert not result.markdown_path.exists()
    assert not result.report_path.exists()
    # The metadata path is still planned, but nothing may be written to it.
    assert result.metadata_path is not None and not result.metadata_path.exists()
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
    """Converting a directory of two PDFs prints matching counts and writes three files per PDF."""
    source = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    first = make_pdf(source, "a.pdf")
    second = make_pdf(source, "한글.pdf")
    adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})

    exit_code = main(["convert", str(source), "--out", str(out_dir)], adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    converted_inputs = [input_path for input_path, _ in adapter.calls]
    assert converted_inputs == [first.resolve(), second.resolve()]
    for line in ("converted: 2", "failed: 0", "warnings: 0"):
        assert line in stdout
    for stem in ("a", "한글"):
        for suffix in (".md", ".metadata.json", ".report.md"):
            assert (out_dir / f"{stem}{suffix}").exists()
+232
View File
@@ -0,0 +1,232 @@
from __future__ import annotations
import subprocess
import sys
from datetime import datetime, timezone
from importlib.metadata import entry_points
from pathlib import Path
import pytest
from pypdf import PdfWriter
from pdf2md.cli import main
from pdf2md.doctor import DoctorCheck, DoctorReport
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult
class FakeAdapter:
    """Adapter double for CLI tests that records inputs and options per ``convert`` call."""

    def __init__(self, *, succeeded: bool = True) -> None:
        self.succeeded = succeeded
        self.calls: list[Path] = []
        self.options: list[object] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Create the work directory, record the call, and return a canned result."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)
        self.options.append(options)

        cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if self.succeeded:
            markdown, reported, exit_code = f"# {source.stem}\n", (), 0
        else:
            markdown, reported, exit_code = None, (cli_failure,), 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=reported,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
def make_pdf(directory: Path, name: str) -> Path:
    """Write a placeholder file holding only a PDF header line and return its path.

    Missing parent directories of ``directory / name`` are created.
    """
    target = directory / name
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\n")
    return target
def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages via ``PdfWriter`` and return its path."""
    target = directory / name
    target.parent.mkdir(parents=True, exist_ok=True)

    pdf_writer = PdfWriter()
    remaining = page_count
    while remaining > 0:
        pdf_writer.add_blank_page(width=72, height=72)
        remaining -= 1

    with target.open("wb") as stream:
        pdf_writer.write(stream)
    return target
def test_console_script_entry_point_is_reserved() -> None:
    """The installed ``pdf2md`` console script must point at ``pdf2md.cli:main``."""
    registered = {}
    for script in entry_points(group="console_scripts"):
        registered[script.name] = script
    assert registered["pdf2md"].value == "pdf2md.cli:main"
def test_cli_no_args_prints_help(capsys) -> None:
    """Calling ``main`` with no arguments exits 0 and prints usage without ``--no-strict-local``."""
    exit_code = main([])
    assert exit_code == 0

    help_text = capsys.readouterr().out
    assert "usage: pdf2md" in help_text
    assert "convert" in help_text
    assert "--no-strict-local" not in help_text
def test_cli_version_module_execution() -> None:
    """Running ``python -m pdf2md.cli --version`` exits 0 and prints ``pdf2md 0.1.0``."""
    command = [sys.executable, "-m", "pdf2md.cli", "--version"]
    proc = subprocess.run(command, check=False, capture_output=True, text=True)

    assert proc.returncode == 0
    assert proc.stdout.strip() == "pdf2md 0.1.0"
def test_cli_doctor_success_returns_zero(capsys) -> None:
    """A doctor report with a single passing check exits 0 and prints PASS lines."""

    def passing_runner() -> DoctorReport:
        return DoctorReport((DoctorCheck("python", "pass", "ok"),))

    exit_code = main(["doctor"], doctor_runner=passing_runner)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    assert "Doctor status: PASS" in stdout
    assert "[PASS] python: ok" in stdout
def test_cli_doctor_warning_only_returns_zero(capsys) -> None:
    """A doctor report containing only a warning still exits 0 and prints WARN lines."""

    def warning_runner() -> DoctorReport:
        return DoctorReport((DoctorCheck("gpu", "warn", "missing"),))

    exit_code = main(["doctor"], doctor_runner=warning_runner)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    assert "Doctor status: WARN" in stdout
    assert "[WARN] gpu: missing" in stdout
def test_cli_doctor_failure_returns_nonzero(capsys) -> None:
    """A doctor report with a failing check exits 1 and prints FAIL lines."""

    def failing_runner() -> DoctorReport:
        return DoctorReport((DoctorCheck("mineru", "fail", "missing"),))

    exit_code = main(["doctor"], doctor_runner=failing_runner)
    stdout = capsys.readouterr().out

    assert exit_code == 1
    assert "Doctor status: FAIL" in stdout
    assert "[FAIL] mineru: missing" in stdout
def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsys) -> None:
    """Converting one PDF writes its three outputs, prints the summary, and passes default options."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    for line in ("converted: 1", "failed: 0", "warnings: 0"):
        assert line in stdout
    for file_name in ("paper.md", "paper.metadata.json", "paper.report.md"):
        assert (out_dir / file_name).exists()
    assert adapter.calls == [source_pdf.resolve()]
    first_options = adapter.options[0]
    assert first_options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
    """PDFs in a directory are converted and reported in name order, not creation order."""
    source = tmp_path / "pdfs"
    # Created in reverse name order on purpose.
    for file_name in ("b.pdf", "a.pdf"):
        make_pdf(source, file_name)
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    converted_names = [call.name for call in adapter.calls]
    assert converted_names == ["a.pdf", "b.pdf"]
    assert "converted: 2" in stdout
    assert stdout.index("a.pdf") < stdout.index("b.pdf")
def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> None:
    """With ``--recursive`` a nested PDF is converted and its output keeps the subdirectory."""
    source = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    make_pdf(source, "top.pdf")
    make_pdf(source / "nested", "child.pdf")
    adapter = FakeAdapter()

    argv = ["convert", str(source), "--out", str(out_dir), "--recursive"]
    exit_code = main(argv, adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    converted_names = [call.name for call in adapter.calls]
    assert converted_names == ["child.pdf", "top.pdf"]
    assert "converted: 2" in stdout
    assert (out_dir / "nested" / "child.md").exists()
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
    """A failing adapter makes the CLI exit 1, report the failure, and write no Markdown."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    failing_adapter = FakeAdapter(succeeded=False)

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=failing_adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 1
    assert "failed: 1" in stdout
    assert "warnings: 1" in stdout
    assert not (out_dir / "paper.md").exists()
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
    """An existing planned output makes the CLI exit 2 on stderr without calling the adapter."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    stale_markdown = out_dir / "paper.md"
    stale_markdown.write_text("old", encoding="utf-8")
    adapter = FakeAdapter()

    exit_code = main(["convert", str(source_pdf), "--out", str(out_dir)], adapter=adapter, clock=fixed_clock)
    stderr = capsys.readouterr().err

    assert exit_code == 2
    assert "planned outputs already exist" in stderr
    assert adapter.calls == []
def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path, capsys) -> None:
    """A bare ``--chunk-pages`` splits a 21-page PDF into a 20-page and a 1-page chunk."""
    source_pdf = make_pdf_with_pages(tmp_path, "long.pdf", 21)
    out_dir = tmp_path / "out"
    adapter = FakeAdapter()

    argv = ["convert", str(source_pdf), "--out", str(out_dir), "--chunk-pages"]
    exit_code = main(argv, adapter=adapter, clock=fixed_clock)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    assert "converted: 2" in stdout
    expected_chunk_stems = ("long.part-001.pages-001-020", "long.part-002.pages-021-021")
    converted_names = [call.name for call in adapter.calls]
    assert converted_names == [f"{stem}.pdf" for stem in expected_chunk_stems]
    for stem in expected_chunk_stems:
        assert (out_dir / f"{stem}.md").exists()
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
    """``--chunk-pages 0`` raises ``SystemExit(2)`` with an explanatory stderr message."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--chunk-pages", "0"]

    with pytest.raises(SystemExit) as exc_info:
        main(argv)
    stderr = capsys.readouterr().err

    assert exc_info.value.code == 2
    assert "must be a positive integer" in stderr
+418
View File
@@ -0,0 +1,418 @@
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
from pypdf import PdfWriter
import pdf2md.conversion as conversion_module
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
from pdf2md.paths import OutputConflictError
class FakeAdapter:
    """Configurable adapter double for conversion tests.

    Each ``convert`` call writes ``raw.log`` into the work directory and is
    recorded in ``calls`` as ``(input_path, work_dir, options)``.
    """

    def __init__(
        self,
        *,
        raw_markdown: str = "# Title\n",
        raw_structured: object | None = None,
        succeeded: bool = True,
        warnings: tuple[WarningRecord, ...] = (),
        asset_name: str | None = None,
    ) -> None:
        self.raw_markdown = raw_markdown
        self.raw_structured = raw_structured
        self.succeeded = succeeded
        self.warnings = warnings
        self.asset_name = asset_name
        self.calls: list[tuple[Path, Path, object]] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Populate the work directory and return the configured result."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        raw_log = target_dir / "raw.log"
        raw_log.write_text("raw output", encoding="utf-8")
        self.calls.append((source, target_dir, options))

        written_assets: tuple[Path, ...] = ()
        if self.asset_name is not None:
            fake_asset = target_dir / "assets" / self.asset_name
            fake_asset.parent.mkdir(parents=True, exist_ok=True)
            fake_asset.write_bytes(b"asset")
            written_assets = (fake_asset,)

        # Configured warnings are passed through on success and on failure alike.
        if self.succeeded:
            markdown, exit_code = self.raw_markdown, 0
        else:
            markdown, exit_code = None, 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=self.succeeded,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured=self.raw_structured,
            asset_paths=written_assets,
            warnings=self.warnings,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
class SequencedAdapter:
    """Adapter double whose successive ``convert`` calls succeed or fail per ``outcomes``.

    Outcomes are consumed from the front, one per call; input paths are
    recorded in ``calls``.
    """

    def __init__(self, outcomes: tuple[bool, ...]) -> None:
        self.outcomes = list(outcomes)
        self.calls: list[Path] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call and return a result matching the next queued outcome."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)

        outcome = self.outcomes.pop(0)
        cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
        if outcome:
            markdown, reported, exit_code = f"# {source.stem}\n", (), 0
        else:
            markdown, reported, exit_code = None, (cli_failure,), 2

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=outcome,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=reported,
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=exit_code,
            stdout="",
            stderr="",
        )
class NestedMinerUAssetAdapter:
    """Adapter double that writes its image asset several directories below the work dir."""

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Write ``paper/hybrid_auto/images/fig.png`` and return a successful result.

        The returned Markdown links the image as ``images/fig.png``, and the
        structured payload is a list of two entries with ``page_idx`` 0 and 12.
        """
        source = Path(input_pdf)
        target_dir = Path(work_dir)

        nested_asset = target_dir.joinpath("paper", "hybrid_auto", "images", "fig.png")
        nested_asset.parent.mkdir(parents=True, exist_ok=True)
        nested_asset.write_bytes(b"nested asset")

        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()

        return MinerUAdapterResult(
            succeeded=True,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown="![fig](images/fig.png)\n\n\\[x^2\\]\n",
            raw_structured=[{"page_idx": 0}, {"page_idx": 12}],
            asset_paths=(nested_asset,),
            warnings=(),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0,
            stdout="",
            stderr="",
        )
def fixed_clock() -> datetime:
    """Return a constant timezone-aware timestamp: 2026-05-08 00:00 UTC."""
    return datetime(year=2026, month=5, day=8, tzinfo=timezone.utc)
def make_pdf(tmp_path: Path, name: str = "paper.pdf") -> Path:
    """Write a small placeholder file named ``name`` under ``tmp_path`` and return its path.

    Missing parent directories are created. The content is only a PDF header
    line followed by a marker line, not a complete PDF document.
    """
    target = tmp_path / name
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(b"%PDF-1.7\nlocal fixture\n")
    return target
def make_pdf_with_pages(tmp_path: Path, page_count: int, name: str = "paper.pdf") -> Path:
    """Write a PDF with ``page_count`` blank 72x72 pages via ``PdfWriter`` and return its path."""
    target = tmp_path / name
    target.parent.mkdir(parents=True, exist_ok=True)

    pdf_writer = PdfWriter()
    remaining = page_count
    while remaining > 0:
        pdf_writer.add_blank_page(width=72, height=72)
        remaining -= 1

    with target.open("wb") as stream:
        pdf_writer.write(stream)
    return target
def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) -> None:
    """A successful conversion writes normalized Markdown, metadata, a report and the copied asset."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"
    adapter = FakeAdapter(
        raw_markdown="# Title\n\nInline \\(x_i\\)\n\n![fig](assets/fig.png)\n",
        raw_structured={"pages": [{}, {}]},
        asset_name="fig.png",
    )

    result = convert_pdf(source_pdf, out_dir, adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)

    assert result.succeeded is True
    assert result.final_status == "success"
    assert result.pages_processed == 2
    assert result.warning_count == 0
    assert result.engine == "MinerU"
    assert result.engine_version == "3.1.0"
    expected_markdown = "# Title\n\nInline $x_i$\n\n![fig](paper.assets/fig.png)\n"
    assert result.markdown_path.read_text(encoding="utf-8") == expected_markdown
    assert (out_dir / "paper.assets" / "fig.png").read_bytes() == b"asset"
    assert result.report_path.exists()

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    summary = metadata["summary"]
    assert metadata["source_sha256"] == hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    assert metadata["created_at"] == "2026-05-08T00:00:00Z"
    assert summary["pages_processed"] == 2
    assert summary["inline_formula_count"] == 1
    assert summary["asset_count"] == 1
    assert metadata["assets"] == [{"relative_path": "paper.assets/fig.png"}]
    assert "- Final status: `success`" in result.report_path.read_text(encoding="utf-8")

    # Without keep_raw the adapter's work directory must be gone afterwards.
    _, adapter_work_dir, _ = adapter.calls[0]
    assert not adapter_work_dir.exists()
def test_convert_pdf_adapter_failure_returns_failed_result_without_fallback_or_outputs(tmp_path: Path) -> None:
    """A failed adapter run is reported as failed, called exactly once, and writes no outputs."""
    source_pdf = make_pdf(tmp_path)
    cli_failure = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
    failing_adapter = FakeAdapter(succeeded=False, warnings=(cli_failure,))

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=failing_adapter, clock=fixed_clock)

    assert result.succeeded is False
    assert result.final_status == "failed"
    assert result.warnings == (cli_failure,)
    # Exactly one adapter call: no second attempt was made.
    assert len(failing_adapter.calls) == 1
    assert not result.markdown_path.exists()
    assert not result.report_path.exists()
def test_convert_pdf_respects_output_conflicts_and_overwrite(tmp_path: Path) -> None:
    """An existing output raises ``OutputConflictError`` unless ``overwrite=True`` is passed."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"
    out_dir.mkdir()
    stale_markdown = out_dir / "paper.md"
    stale_markdown.write_text("old", encoding="utf-8")

    with pytest.raises(OutputConflictError):
        convert_pdf(source_pdf, out_dir, adapter=FakeAdapter(), clock=fixed_clock)

    replacing_adapter = FakeAdapter(raw_markdown="new\n")
    result = convert_pdf(source_pdf, out_dir, adapter=replacing_adapter, clock=fixed_clock, overwrite=True)

    assert result.succeeded is True
    assert result.markdown_path.read_text(encoding="utf-8") == "new\n"
def test_convert_pdf_can_skip_metadata_json_but_still_writes_report(tmp_path: Path) -> None:
    """With ``metadata=False`` no metadata JSON is written or mentioned, but the report still is."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    result = convert_pdf(source_pdf, out_dir, metadata=False, adapter=FakeAdapter(), clock=fixed_clock)

    assert result.metadata_path is None
    assert result.markdown_path.exists()
    assert result.report_path.exists()
    assert not (out_dir / "paper.metadata.json").exists()

    report_text = result.report_path.read_text(encoding="utf-8")
    assert "Metadata JSON:" not in report_text
    assert "Report Markdown:" in report_text
def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_path: Path) -> None:
    """A rejecting math checker yields a partial result with ``MATH_RENDER_FAILED`` recorded."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: False, clock=fixed_clock)

    assert result.final_status == "partial"
    warning_codes = [record.code for record in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_FAILED]

    metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
    assert metadata["summary"]["math_render_error_count"] == 1
    assert metadata["warnings"][0]["code"] == "MATH_RENDER_FAILED"

    report_text = result.report_path.read_text(encoding="utf-8")
    assert "- Math render error count: 1" in report_text
    assert "`MATH_RENDER_FAILED`" in report_text
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
    """With no default math checker, math output gets an INFO warning but no render error count."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    # Simulate an environment in which no default checker can be created.
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: None)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    assert result.final_status == "partial"
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO

    summary = json.loads(result.metadata_path.read_text(encoding="utf-8"))["summary"]
    assert summary["warning_count"] == 1
    assert summary["math_render_error_count"] == 0

    report_text = result.report_path.read_text(encoding="utf-8")
    assert "unavailable" in report_text
    assert "- Math render error count: 0" in report_text
def test_convert_pdf_uses_default_math_checker_when_available(tmp_path: Path, monkeypatch) -> None:
    """When no checker is passed, the one from ``create_default_math_checker`` receives the math bodies."""

    class DefaultChecker:
        """Checker double that remembers the expression bodies it was asked about."""

        def __init__(self) -> None:
            self.bodies: list[str] = []

        def check_expressions(self, expressions):
            self.bodies = [item.body for item in expressions]
            return (True,)

    recording_checker = DefaultChecker()
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
    monkeypatch.setattr(conversion_module, "create_default_math_checker", lambda: recording_checker)

    result = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    assert result.final_status == "success"
    assert result.warning_count == 0
    assert recording_checker.bodies == ["x"]
def test_convert_pdf_keep_raw_preserves_adapter_work_directory(tmp_path: Path) -> None:
    """With ``keep_raw=True`` the adapter's files remain available under ``paper.raw``."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    result = convert_pdf(source_pdf, out_dir, keep_raw=True, adapter=FakeAdapter(), clock=fixed_clock)

    assert result.raw_dir == out_dir / "paper.raw"
    kept_log = result.raw_dir / "raw.log"
    assert kept_log.read_text(encoding="utf-8") == "raw output"
def test_convert_pdf_rejects_disabling_strict_local(tmp_path: Path) -> None:
    """Passing ``strict_local=False`` raises ``StrictLocalViolationError``."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    with pytest.raises(StrictLocalViolationError):
        convert_pdf(source_pdf, out_dir, strict_local=False, adapter=FakeAdapter(), clock=fixed_clock)
def test_convert_pdf_passes_gpu_device_to_strict_local_options(tmp_path: Path) -> None:
    """An explicit ``gpu`` argument reaches the adapter options as ``gpu_device``."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter()

    convert_pdf(source_pdf, tmp_path / "out", gpu="cuda:0", adapter=adapter, clock=fixed_clock)

    _, _, received_options = adapter.calls[0]
    assert received_options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None:
    """Without a ``gpu`` argument the adapter options carry ``gpu_device`` ``cuda:0``."""
    source_pdf = make_pdf(tmp_path)
    adapter = FakeAdapter()

    convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)

    _, _, received_options = adapter.calls[0]
    assert received_options.to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_path: Path) -> None:
    """Nested asset paths are preserved under ``paper.assets`` and the page count is 13."""
    source_pdf = make_pdf(tmp_path)
    out_dir = tmp_path / "out"

    result = convert_pdf(
        source_pdf,
        out_dir,
        adapter=NestedMinerUAssetAdapter(),
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    assert result.final_status == "success"
    assert result.pages_processed == 13

    markdown_text = result.markdown_path.read_text(encoding="utf-8")
    assert "![fig](paper.assets/paper/hybrid_auto/images/fig.png)" in markdown_text
    assert "](images/fig.png)" not in markdown_text

    copied_asset = out_dir.joinpath("paper.assets", "paper", "hybrid_auto", "images", "fig.png")
    assert copied_asset.read_bytes() == b"nested asset"

    summary = json.loads(result.metadata_path.read_text(encoding="utf-8"))["summary"]
    assert summary["pages_processed"] == 13
    assert summary["warning_count"] == 0
def test_convert_input_batch_continues_after_per_file_failure(tmp_path: Path) -> None:
    """A failure on the middle file does not stop the remaining files from converting."""
    source = tmp_path / "pdfs"
    out_dir = tmp_path / "out"
    for file_name in ("a.pdf", "b.pdf", "c.pdf"):
        make_pdf(source, file_name)
    adapter = SequencedAdapter((True, False, True))

    batch = convert_input(source, out_dir, adapter=adapter, clock=fixed_clock)

    attempted_names = [call.name for call in adapter.calls]
    assert attempted_names == ["a.pdf", "b.pdf", "c.pdf"]
    assert batch.converted_count == 2
    assert batch.failed_count == 1
    assert (out_dir / "a.md").exists()
    assert not (out_dir / "b.md").exists()
    assert (out_dir / "c.md").exists()
def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(tmp_path: Path) -> None:
    """A 41-page PDF chunked by 20 yields three results whose chunk PDFs no longer exist."""
    source_pdf = make_pdf_with_pages(tmp_path, 41, "thesis.pdf")
    out_dir = tmp_path / "out"
    adapter = FakeAdapter(raw_structured={"pages": 1})

    batch = convert_pdf(
        source_pdf,
        out_dir,
        adapter=adapter,
        math_checker=lambda _: True,
        chunk_pages=20,
        clock=fixed_clock,
    )

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 3

    markdown_names = [item.markdown_path.name for item in batch.results]
    assert markdown_names == [
        "thesis.part-001.pages-001-020.md",
        "thesis.part-002.pages-021-040.md",
        "thesis.part-003.pages-041-041.md",
    ]
    chunk_inputs = [chunk_pdf for chunk_pdf, _, _ in adapter.calls]
    assert [chunk_pdf.name for chunk_pdf in chunk_inputs] == [
        "thesis.part-001.pages-001-020.pdf",
        "thesis.part-002.pages-021-040.pdf",
        "thesis.part-003.pages-041-041.pdf",
    ]
    # Every result refers to the original PDF, and the chunk files are cleaned up.
    assert all(item.source_pdf == source_pdf.resolve() for item in batch.results)
    assert all(not chunk_pdf.exists() for chunk_pdf in chunk_inputs)

    second_metadata_path = out_dir / "thesis.part-002.pages-021-040.metadata.json"
    metadata = json.loads(second_metadata_path.read_text(encoding="utf-8"))
    assert metadata["source_pdf"] == str(source_pdf.resolve())
    assert metadata["source_sha256"] == hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    expected_chunk = {
        "chunk_index": 2,
        "chunk_page_count": 20,
        "chunk_pdf_name": "thesis.part-002.pages-021-040.pdf",
        "original_source_pdf": str(source_pdf.resolve()),
        "source_page_end": 40,
        "source_page_start": 21,
        "total_chunks": 3,
    }
    assert metadata["engine_options"]["chunk"] == expected_chunk

    second_report_path = out_dir / "thesis.part-002.pages-021-040.report.md"
    report_text = second_report_path.read_text(encoding="utf-8")
    assert "- Chunk: 2/3, source pages: 21-40" in report_text
def test_convert_pdf_chunk_mode_keeps_short_pdf_as_single_batch_result(tmp_path: Path) -> None:
    """A 3-page PDF with ``chunk_pages=20`` is converted as-is, still wrapped in a batch result."""
    source_pdf = make_pdf_with_pages(tmp_path, 3, "short.pdf")
    adapter = FakeAdapter(raw_structured={"pages": 3})

    batch = convert_pdf(source_pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock)

    assert isinstance(batch, BatchConversionResult)
    assert batch.converted_count == 1
    only_result = batch.results[0]
    assert only_result.markdown_path.name == "short.md"
    # The adapter received the original file, which must still exist.
    assert adapter.calls[0][0] == source_pdf.resolve()
    assert adapter.calls[0][0].exists()
def test_convert_input_chunk_mode_continues_after_failed_chunk(tmp_path: Path) -> None:
    """A failed middle chunk does not stop the last chunk from being converted."""
    source_pdf = make_pdf_with_pages(tmp_path, 41, "paper.pdf")
    out_dir = tmp_path / "out"
    adapter = SequencedAdapter((True, False, True))

    batch = convert_input(source_pdf, out_dir, adapter=adapter, chunk_pages=20, clock=fixed_clock)

    assert batch.converted_count == 2
    assert batch.failed_count == 1
    attempted_names = [call.name for call in adapter.calls]
    assert attempted_names == [
        "paper.part-001.pages-001-020.pdf",
        "paper.part-002.pages-021-040.pdf",
        "paper.part-003.pages-041-041.pdf",
    ]
    assert (out_dir / "paper.part-001.pages-001-020.md").exists()
    assert not (out_dir / "paper.part-002.pages-021-040.md").exists()
    assert (out_dir / "paper.part-003.pages-041-041.md").exists()
+311
View File
@@ -0,0 +1,311 @@
from __future__ import annotations
from pathlib import Path
from pdf2md.doctor import DoctorCommandResult, DoctorReport, format_doctor_report, run_doctor
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
from pdf2md.math_render import default_mathjax_helper_path
from pdf2md.mineru_adapter import MinerUVersionResult
class FakeMinerUProbe:
    """Probe double whose ``version()`` returns the result given at construction."""

    def __init__(self, result: MinerUVersionResult) -> None:
        self.result = result

    def version(self) -> MinerUVersionResult:
        """Return the stored result unchanged."""
        canned = self.result
        return canned
class FakeCuda:
    """Stand-in exposing the four CUDA query methods, backed by constructor-supplied tuples."""

    def __init__(
        self,
        *,
        available: bool = True,
        devices: tuple[str, ...] = ("NVIDIA RTX 4060",),
        capabilities: tuple[tuple[int, int], ...] = ((8, 9),),
    ) -> None:
        self._available = available
        self._devices = devices
        self._capabilities = capabilities

    def is_available(self) -> bool:
        """Return the configured availability flag."""
        return self._available

    def device_count(self) -> int:
        """Return the number of configured device names, regardless of availability."""
        return len(self._devices)

    def get_device_name(self, index: int) -> str:
        """Return the configured name at ``index``."""
        name = self._devices[index]
        return name

    def get_device_capability(self, index: int) -> tuple[int, int]:
        """Return the configured capability pair at ``index``."""
        capability = self._capabilities[index]
        return capability
class FakeTorchVersion:
    """Stand-in for ``torch.version`` exposing only the CUDA version string."""

    # CUDA toolkit version string reported by the fake torch build.
    cuda = "12.8"
class FakeTorch:
    """Minimal torch-module stand-in exposing ``__version__``, ``version`` and ``cuda``."""

    __version__ = "2.8.0+cu128"
    # Shared by all instances; carries the fake CUDA version string.
    version = FakeTorchVersion()

    def __init__(self, cuda: FakeCuda) -> None:
        # The fake CUDA namespace is injected so each test can shape device data.
        self.cuda = cuda
def test_doctor_all_checks_pass_with_mocked_tools(tmp_path: Path) -> None:
    """With every dependency faked as healthy, the report passes and lists checks in a fixed order."""
    expected_check_order = [
        "python",
        "uv",
        "mineru",
        "gpu",
        "pytorch",
        "models",
        "mathjax",
        "local-only",
    ]

    report = make_report(
        tmp_path,
        env={"HF_HOME": str(tmp_path / "hf")},
        existing_paths={tmp_path / "hf"},
    )

    assert report.status == "pass"
    assert report.exit_code == 0
    observed_order = [check.name for check in report.checks]
    assert observed_order == expected_check_order
def test_doctor_fails_outside_python_312(tmp_path: Path) -> None:
    """A Python 3.11 interpreter fails the python check and the overall report."""
    report = make_report(tmp_path, python_version=(3, 11, 9))

    assert report.status == "fail"

    interpreter_check = find_check(report, "python")
    assert interpreter_check.status == "fail"
    assert "use Python 3.12.x" in interpreter_check.message
def test_doctor_fails_when_uv_is_missing(tmp_path: Path) -> None:
    """When only ``nvidia-smi`` is discoverable, the uv check fails the report."""
    tools_without_uv = {"nvidia-smi": "C:/Windows/System32/nvidia-smi.exe"}

    report = make_report(tmp_path, available_tools=tools_without_uv)

    assert report.status == "fail"
    uv_result = find_check(report, "uv")
    assert uv_result.status == "fail"
    assert "uv executable was not found" in uv_result.message
def test_doctor_fails_when_mineru_is_missing(tmp_path: Path) -> None:
    """An unavailable MinerU probe result fails the mineru check with exit code 1."""
    unavailable = MinerUVersionResult(
        available=False,
        version=None,
        command=("mineru", "--version"),
        exit_code=None,
        stdout="",
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=unavailable)

    assert report.status == "fail"
    assert report.exit_code == 1
    engine_check = find_check(report, "mineru")
    assert engine_check.status == "fail"
    assert "MinerU CLI executable was not found" in engine_check.message
def test_doctor_warns_when_mineru_version_command_fails(tmp_path: Path) -> None:
    """A present MinerU whose version command exits non-zero downgrades the report to a warning."""
    cli_failure = WarningRecord(
        WarningCode.MINERU_CLI_FAILED,
        WarningSeverity.ERROR,
        "MinerU version command failed.",
    )
    failed_probe = MinerUVersionResult(
        available=True,
        version=None,
        command=("mineru", "--version"),
        exit_code=2,
        stdout="",
        stderr="boom",
        warnings=(cli_failure,),
    )

    report = make_report(tmp_path, mineru_result=failed_probe)

    assert report.status == "warn"
    engine_check = find_check(report, "mineru")
    assert engine_check.status == "warn"
    assert "version could not be detected" in engine_check.message
def test_doctor_warns_when_mineru_version_is_not_target(tmp_path: Path) -> None:
    """A MinerU version other than the project target (3.1.0) produces a warning."""
    off_target = MinerUVersionResult(
        available=True,
        version="mineru, version 3.1.8",
        command=("mineru", "--version"),
        exit_code=0,
        stdout="mineru, version 3.1.8",
        stderr="",
    )

    report = make_report(tmp_path, mineru_result=off_target)

    assert report.status == "warn"
    engine_check = find_check(report, "mineru")
    assert engine_check.status == "warn"
    assert "project target is 3.1.0" in engine_check.message
def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None:
    """With only ``uv`` discoverable and the torch import failing, gpu and pytorch checks warn."""
    only_uv = {"uv": "C:/Users/user/.local/bin/uv.exe"}

    report = make_report(tmp_path, available_tools=only_uv, import_module=missing_torch)

    assert report.status == "warn"
    gpu_status = find_check(report, "gpu").status
    assert gpu_status == "warn"
    pytorch_status = find_check(report, "pytorch").status
    assert pytorch_status == "warn"
def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None:
    """A GTX 1070 Ti in the GPU query output triggers the Pascal/pre-Turing warning."""
    pascal_gpu_line = "NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n"

    report = make_report(tmp_path, gpu_stdout=pascal_gpu_line)

    assert report.status == "warn"
    gpu_result = find_check(report, "gpu")
    assert gpu_result.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in gpu_result.message
    assert any("GTX 1070 Ti" in detail for detail in gpu_result.details)
def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None:
    """A torch-reported compute capability of 6.1 warns even when the GPU query shows an RTX 4060."""

    def fake_pascal_torch(name: str) -> FakeTorch:
        # Import hook replacement: only ``torch`` may be requested.
        assert name == "torch"
        pascal_cuda = FakeCuda(devices=("NVIDIA GeForce GTX 1070 Ti",), capabilities=((6, 1),))
        return FakeTorch(pascal_cuda)

    report = make_report(
        tmp_path,
        gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n",
        import_module=fake_pascal_torch,
    )

    assert report.status == "warn"
    torch_check = find_check(report, "pytorch")
    assert torch_check.status == "warn"
    assert "Pascal/pre-Turing compatibility risk" in torch_check.message
    assert any("compute capability 6.1" in detail for detail in torch_check.details)
def test_doctor_warns_when_model_cache_is_not_detected(tmp_path: Path) -> None:
    """An empty environment and no existing cache paths make the models check warn."""
    report = make_report(tmp_path, env={}, existing_paths=set())

    assert report.status == "warn"
    cache_check = find_check(report, "models")
    assert cache_check.status == "warn"
    assert "No MinerU model/cache/config path" in cache_check.message
def test_doctor_warns_when_mathjax_node_is_missing(tmp_path: Path) -> None:
    """Without a discoverable ``node`` executable the mathjax check warns."""
    tools_without_node = {
        "uv": "C:/Users/user/.local/bin/uv.exe",
        "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
    }

    report = make_report(tmp_path, available_tools=tools_without_node)

    assert report.status == "warn"
    render_check = find_check(report, "mathjax")
    assert render_check.status == "warn"
    assert "Node.js executable was not found" in render_check.message
def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
    """A failing ``--health`` helper invocation makes the mathjax check warn and surfaces stderr."""

    def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult:
        # Every command except the health probe is delegated to the default fake runner.
        if command[-1] != "--health":
            return command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")(command)
        return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'")

    report = make_report(tmp_path, run_command=failing_runner)

    assert report.status == "warn"
    render_check = find_check(report, "mathjax")
    assert render_check.status == "warn"
    assert "unavailable" in render_check.message
    assert any("mathjax" in detail for detail in render_check.details)
def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
    """The formatted report starts with the overall status and tags each check with its status."""
    warn_report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n")

    rendered = format_doctor_report(warn_report)

    assert rendered.startswith("Doctor status: WARN\n")
    for expected_fragment in ("[WARN] gpu:", "[PASS] local-only:"):
        assert expected_fragment in rendered
def make_report(
    tmp_path: Path,
    *,
    python_version: tuple[int, int, int] = (3, 12, 7),
    available_tools: dict[str, str] | None = None,
    mineru_result: MinerUVersionResult | None = None,
    gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n",
    env: dict[str, str] | None = None,
    existing_paths: set[Path] | None = None,
    import_module=None,
    run_command=None,
) -> DoctorReport:
    """Run ``run_doctor`` against fully faked collaborators and return its report.

    Every optional argument falls back to a "healthy" default only when it is
    ``None``. An explicitly passed empty value (for example
    ``available_tools={}`` to simulate a machine with no tools on PATH) is
    honoured as given.

    Args:
        tmp_path: Used as the fake home directory and as the parent of the
            default ``HF_HOME`` cache path.
        python_version: Version tuple handed to the doctor.
        available_tools: Mapping of executable name to the path the fake
            ``which`` returns; names not in the mapping resolve to ``None``.
        mineru_result: Result the fake MinerU probe reports.
        gpu_stdout: Stdout returned by the default fake runner for
            ``nvidia-smi`` commands (ignored when ``run_command`` is given).
        env: Environment mapping handed to the doctor.
        existing_paths: Paths the fake ``path_exists`` reports as present. The
            default MathJax helper path is always added to this set.
        import_module: Replacement for the module importer; defaults to
            ``fake_torch``.
        run_command: Replacement command runner; defaults to
            ``command_runner(gpu_stdout)``.
    """
    # ``is None`` rather than ``or``: an empty dict is a meaningful input here.
    if available_tools is None:
        tools = {
            "uv": "C:/Users/user/.local/bin/uv.exe",
            "nvidia-smi": "C:/Windows/System32/nvidia-smi.exe",
            "node": "C:/Program Files/nodejs/node.exe",
        }
    else:
        tools = available_tools

    if mineru_result is None:
        result = MinerUVersionResult(
            available=True,
            version="mineru, version 3.1.0",
            command=("mineru", "--version"),
            exit_code=0,
            stdout="mineru, version 3.1.0",
            stderr="",
        )
    else:
        result = mineru_result

    environment = env if env is not None else {"HF_HOME": str(tmp_path / "hf")}

    # Copy so the caller's set is never mutated by the helper-path addition below.
    paths = set(existing_paths if existing_paths is not None else {tmp_path / "hf"})
    paths.add(default_mathjax_helper_path())

    return run_doctor(
        python_version=python_version,
        which=lambda executable: tools.get(executable),
        run_command=run_command if run_command is not None else command_runner(gpu_stdout),
        import_module=import_module if import_module is not None else fake_torch,
        env=environment,
        path_exists=lambda path: path in paths,
        home=tmp_path,
        mineru_probe=FakeMinerUProbe(result),
    )
def command_runner(gpu_stdout: str):
    """Build a fake command runner that answers the doctor's known probes.

    The returned callable succeeds for ``uv --version``, any ``nvidia-smi``
    command (replying with ``gpu_stdout``), a two-element ``<...node.exe>
    --version`` command, and any command ending in ``--health``. Everything
    else is answered with exit code 127.
    """

    def run(command: tuple[str, ...]) -> DoctorCommandResult:
        is_node_version_probe = (
            len(command) == 2 and command[1] == "--version" and command[0].endswith("node.exe")
        )
        if command == ("uv", "--version"):
            reply = "uv 0.8.13\n"
        elif command and command[0] == "nvidia-smi":
            reply = gpu_stdout
        elif is_node_version_probe:
            reply = "v24.13.0\n"
        elif command and command[-1] == "--health":
            reply = '{"ok":true}\n'
        else:
            return DoctorCommandResult(command, 127, stderr="not found")
        return DoctorCommandResult(command, 0, stdout=reply)

    return run
def fake_torch(name: str) -> FakeTorch:
    """Importer replacement that returns a default ``FakeTorch`` for the name ``torch``."""
    assert name == "torch"
    default_cuda = FakeCuda()
    return FakeTorch(default_cuda)
def missing_torch(name: str):
    """Importer replacement that always raises ``ImportError`` for the name ``torch``."""
    assert name == "torch"
    import_failure = ImportError(name)
    raise import_failure
def find_check(report: DoctorReport, name: str):
    """Return the first check in ``report.checks`` whose ``name`` equals ``name``.

    Raises:
        AssertionError: If no check has that name. The message lists the names
            that are present, so a typo or a renamed check is obvious from the
            test failure instead of surfacing as a bare ``StopIteration``.
    """
    for check in report.checks:
        if check.name == name:
            return check
    available = ", ".join(repr(check.name) for check in report.checks)
    raise AssertionError(
        f"doctor report has no check named {name!r}; available checks: {available or 'none'}"
    )
+136
View File
@@ -0,0 +1,136 @@
from __future__ import annotations
import json
from pathlib import Path
import pytest
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
def test_record_serialization_preserves_present_optional_fields(tmp_path: Path) -> None:
    """Optional fields that are set appear in ``to_dict()`` output, with tuples emitted as lists."""
    formula_block = BlockRecord(
        BlockType.INLINE_FORMULA,
        page_index=1,
        bbox=(1.0, 2.0, 3.0, 4.0),
        confidence=0.92,
        markdown_span=(10, 20),
    )
    low_confidence = WarningRecord(
        WarningCode.LOW_CONFIDENCE_FORMULA,
        WarningSeverity.WARNING,
        "Formula confidence is low.",
        page_index=1,
        bbox=(1.0, 2.0, 3.0, 4.0),
    )
    document = DocumentRecord(
        tmp_path / "paper.pdf",
        pages=(PageRecord(page_index=1, width=612, height=792, blocks=(formula_block,)),),
        assets=(AssetRecord("paper.assets/image.png", page_index=1, bbox=(5.0, 6.0, 7.0, 8.0)),),
        warnings=(low_confidence,),
    )

    data = document.to_dict()

    assert data["source_pdf"] == str(tmp_path / "paper.pdf")
    assert data["pages"][0]["width"] == 612
    assert data["pages"][0]["height"] == 792
    assert data["pages"][0]["blocks"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]
    assert data["pages"][0]["blocks"][0]["confidence"] == 0.92
    assert data["pages"][0]["blocks"][0]["markdown_span"] == [10, 20]
    assert data["assets"][0]["bbox"] == [5.0, 6.0, 7.0, 8.0]
    assert data["warnings"][0]["bbox"] == [1.0, 2.0, 3.0, 4.0]
    # Must not raise: the whole structure has to be JSON serializable.
    json.dumps(data)
def test_record_serialization_omits_absent_optional_fields(tmp_path: Path) -> None:
    """Optional fields left unset are absent from the serialized block and page dictionaries."""
    bare_block = BlockRecord(BlockType.PARAGRAPH)
    bare_page = PageRecord(page_index=0, blocks=(bare_block,))
    document = DocumentRecord(tmp_path / "paper.pdf", pages=(bare_page,))

    block_data = document.to_dict()["pages"][0]["blocks"][0]
    page_data = document.to_dict()["pages"][0]

    for absent_key in ("page_index", "bbox", "confidence", "markdown_span"):
        assert absent_key not in block_data
    for absent_key in ("width", "height"):
        assert absent_key not in page_data
def test_block_types_and_warning_codes_match_architecture_set() -> None:
    """Block types must match the expected set exactly; warning codes must at least include it."""
    expected_block_types = {
        "heading",
        "paragraph",
        "inline_formula",
        "display_formula",
        "table",
        "figure",
        "caption",
        "footnote",
        "reference",
        "unknown",
    }
    required_warning_codes = {
        "ENGINE_MISSING",
        "GPU_UNAVAILABLE",
        "LOW_CONFIDENCE_FORMULA",
        "MATH_RENDER_FAILED",
        "ASSET_LINK_MISSING",
        "READING_ORDER_UNCERTAIN",
        "STRICT_LOCAL_VIOLATION",
        "MINERU_CLI_FAILED",
    }

    actual_block_types = {item.value for item in BlockType}
    actual_warning_codes = {item.value for item in WarningCode}

    assert actual_block_types == expected_block_types
    # Superset comparison: additional warning codes are allowed.
    assert actual_warning_codes >= required_warning_codes
@pytest.mark.parametrize("invalid_block_type", ["formula", "image"])
def test_invalid_block_type_fails_predictably(invalid_block_type: str) -> None:
    """A plain string that is not a known block type is rejected with ``ValueError``."""
    expected_error = pytest.raises(ValueError, match="invalid block_type")
    with expected_error:
        BlockRecord(invalid_block_type)  # type: ignore[arg-type]
@pytest.mark.parametrize("invalid_code", ["REMOTE_API_USED", "UNKNOWN_WARNING"])
def test_invalid_warning_code_fails_predictably(invalid_code: str) -> None:
    """A string that is not a known warning code is rejected with ``ValueError``."""
    expected_error = pytest.raises(ValueError, match="invalid code")
    with expected_error:
        WarningRecord(invalid_code, WarningSeverity.WARNING, "message")  # type: ignore[arg-type]
@pytest.mark.parametrize("invalid_severity", ["fatal", "warn"])
def test_invalid_warning_severity_fails_predictably(invalid_severity: str) -> None:
    """A string that is not a known severity is rejected with ``ValueError``."""
    expected_error = pytest.raises(ValueError, match="invalid severity")
    with expected_error:
        WarningRecord(WarningCode.MATH_RENDER_FAILED, invalid_severity, "message")  # type: ignore[arg-type]
def test_empty_pages_are_rejected(tmp_path: Path) -> None:
    """A document constructed with an empty page tuple raises ``ValueError``."""
    source = tmp_path / "paper.pdf"
    with pytest.raises(ValueError, match="at least one page"):
        DocumentRecord(source, pages=())
def test_empty_source_pdf_is_rejected() -> None:
    """An empty ``source_pdf`` string raises ``ValueError`` even with a valid page."""
    single_page = (PageRecord(page_index=0),)
    with pytest.raises(ValueError, match="source_pdf"):
        DocumentRecord("", pages=single_page)
def test_invalid_optional_fields_are_rejected() -> None:
    """Out-of-range or malformed optional block fields each raise a field-specific ``ValueError``."""
    paragraph = BlockType.PARAGRAPH

    # Negative page index.
    with pytest.raises(ValueError, match="page_index"):
        BlockRecord(paragraph, page_index=-1)

    # Bounding box with three coordinates instead of four.
    with pytest.raises(ValueError, match="bbox"):
        BlockRecord(paragraph, bbox=(1.0, 2.0, 3.0))  # type: ignore[arg-type]

    # Confidence of 1.2.
    with pytest.raises(ValueError, match="confidence"):
        BlockRecord(paragraph, confidence=1.2)

    # Span whose end precedes its start.
    with pytest.raises(ValueError, match="markdown_span"):
        BlockRecord(paragraph, markdown_span=(5, 3))
def test_asset_paths_must_be_relative() -> None:
    """Absolute asset paths and paths starting with ``..`` both raise ``ValueError``."""
    with pytest.raises(ValueError, match="relative"):
        AssetRecord("/absolute/image.png")

    with pytest.raises(ValueError, match="relative"):
        AssetRecord("../outside.png")
+159
View File
@@ -0,0 +1,159 @@
from __future__ import annotations
from pathlib import Path
import pytest
from pdf2md.ir import WarningCode
from pdf2md.markdown import normalize_markdown
def test_inline_parentheses_math_becomes_obsidian_dollars() -> None:
    r"""Inline ``\( ... \)`` math is rewritten to ``$ ... $`` without warnings."""
    normalized = normalize_markdown(r"Area is \(x_i^2 + y^{2}\).")

    assert normalized.markdown == r"Area is $x_i^2 + y^{2}$."
    assert normalized.warnings == ()
def test_existing_dollar_math_and_currency_are_not_rewritten() -> None:
    """Text mixing currency amounts and existing ``$...$`` math comes back unchanged."""
    original_text = r"Cost is $5 and $10, while math $x_i^2$ stays."

    normalized = normalize_markdown(original_text)

    assert normalized.markdown == original_text
def test_display_bracket_math_gets_own_delimiter_lines_and_blank_lines() -> None:
    r"""Display ``\[ ... \]`` math becomes a ``$$`` block separated from text by blank lines."""
    expected = "Before\n\n$$\na_i^2 + b^2\n$$\n\nAfter"

    normalized = normalize_markdown("Before\n\\[\na_i^2 + b^2\n\\]\nAfter")

    assert normalized.markdown == expected
def test_display_environment_body_is_preserved_inside_delimiters() -> None:
    """An ``align`` environment inside display brackets is kept verbatim between ``$$`` lines."""
    bracketed = "\\[\\begin{align}\na_i &= b^2\n\\end{align}\\]"
    expected = "$$\n\\begin{align}\na_i &= b^2\n\\end{align}\n$$"

    normalized = normalize_markdown(bracketed)

    assert normalized.markdown == expected
def test_existing_display_math_spacing_is_idempotent() -> None:
    """Existing ``$$`` blocks gain surrounding blank lines once; a second pass changes nothing."""
    tight_source = "Before\n$$\nx_i^2\n$$\nAfter"

    first_pass = normalize_markdown(tight_source).markdown
    second_pass = normalize_markdown(first_pass).markdown

    assert first_pass == "Before\n\n$$\nx_i^2\n$$\n\nAfter"
    assert second_pass == first_pass
def test_underscores_carets_braces_and_backslashes_inside_math_are_preserved() -> None:
    """Only the math delimiters change; the TeX body is left untouched."""
    tex_input = r"\(\frac{x_i^{2}}{\alpha_beta}\)"

    normalized = normalize_markdown(tex_input)

    assert normalized.markdown == r"$\frac{x_i^{2}}{\alpha_beta}$"
def test_fenced_code_blocks_are_not_normalized() -> None:
    """Content inside a fenced code block is left as-is while math after the fence is converted."""
    fenced_input = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n![alt](assets\\x.png)\n```\n\\(z\\)"
    expected = "Text\n```md\n\\(x_i\\)\n\\[y\\]\n![alt](assets\\x.png)\n```\n$z$"

    normalized = normalize_markdown(fenced_input)

    assert normalized.markdown == expected
def test_inline_code_spans_are_not_normalized() -> None:
    """Math-looking text inside backticks is kept while math outside the span is converted."""
    text_with_code_span = r"Keep `\(x_i\)` and convert \(y_i\)."

    normalized = normalize_markdown(text_with_code_span)

    assert normalized.markdown == r"Keep `\(x_i\)` and convert $y_i$."
def test_normalization_is_idempotent_for_mixed_content(tmp_path: Path) -> None:
    """Normalizing already-normalized mixed math/asset content yields the same Markdown and warnings."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    (asset_root / "fig 1.png").write_bytes(b"image")
    mixed_source = "Before \\(x_i\\)\n\\[y^2\\]\n![fig](assets\\fig 1.png)"

    first_pass = normalize_markdown(
        mixed_source,
        markdown_dir=tmp_path,
        asset_root=asset_root,
        check_assets=True,
    )
    second_pass = normalize_markdown(
        first_pass.markdown,
        markdown_dir=tmp_path,
        asset_root=asset_root,
        check_assets=True,
    )

    assert second_pass.markdown == first_pass.markdown
    assert second_pass.warnings == first_pass.warnings
def test_relative_asset_links_use_posix_paths_and_preserve_alt_text() -> None:
    """Backslash separators in a relative image link become ``/``; non-ASCII alt text is kept."""
    normalized = normalize_markdown(r"![한글 caption](assets\fig 1.png)")

    assert normalized.markdown == "![한글 caption](assets/fig 1.png)"
    assert normalized.asset_links == ("assets/fig 1.png",)
    assert normalized.warnings == ()
def test_missing_asset_link_emits_warning_when_checking_is_enabled(tmp_path: Path) -> None:
    """With asset checking on, a link to a nonexistent file keeps its text and adds one warning."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    dangling_link = "![missing](assets/missing.png)"

    normalized = normalize_markdown(
        dangling_link,
        markdown_dir=tmp_path,
        asset_root=asset_root,
        check_assets=True,
    )

    assert normalized.markdown == "![missing](assets/missing.png)"
    emitted_codes = [warning.code for warning in normalized.warnings]
    assert emitted_codes == [WarningCode.ASSET_LINK_MISSING]
@pytest.mark.parametrize(
    ("source", "expected_link"),
    [
        (r"![absolute](C:\tmp\fig.png)", "fig.png"),
        ("![escape](../outside.png)", "outside.png"),
    ],
)
def test_invalid_local_asset_links_are_rewritten_as_relative_with_warning(source: str, expected_link: str) -> None:
    """Absolute and parent-escaping local links end up as a bare file name plus one warning."""
    normalized = normalize_markdown(source)

    expected_suffix = f"({expected_link})"
    assert normalized.markdown.endswith(expected_suffix)
    emitted_codes = [warning.code for warning in normalized.warnings]
    assert emitted_codes == [WarningCode.ASSET_LINK_INVALID]
def test_remote_asset_link_is_warned_and_not_fetched_or_rewritten() -> None:
    """A remote image URL is left verbatim, reported in ``asset_links``, and flagged invalid."""
    remote_markdown = "![remote](https://example.test/fig.png)"

    normalized = normalize_markdown(remote_markdown)

    assert normalized.markdown == remote_markdown
    assert normalized.asset_links == ("https://example.test/fig.png",)
    emitted_codes = [warning.code for warning in normalized.warnings]
    assert emitted_codes == [WarningCode.ASSET_LINK_INVALID]
def test_absolute_asset_under_markdown_dir_can_be_rewritten_relative_with_warning(tmp_path: Path) -> None:
    """An absolute link to a file below the Markdown directory is made relative, with a warning."""
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir()
    figure = asset_dir / "fig.png"
    figure.write_bytes(b"image")

    normalized = normalize_markdown(
        f"![fig]({figure})",
        markdown_dir=tmp_path,
        asset_root=asset_dir,
        check_assets=True,
    )

    assert normalized.markdown == "![fig](assets/fig.png)"
    emitted_codes = [warning.code for warning in normalized.warnings]
    assert emitted_codes == [WarningCode.ASSET_LINK_INVALID]
def test_simple_pipe_table_is_preserved() -> None:
    """A pipe table, including its bracket-style math cell, comes back unchanged and warning-free."""
    pipe_table = "| A | B |\n|---|---|\n| \\(x\\) | y |"

    normalized = normalize_markdown(pipe_table)

    assert normalized.markdown == pipe_table
    assert normalized.warnings == ()
def test_complex_html_table_is_preserved_with_fallback_warning() -> None:
    """An HTML table with ``rowspan`` is left verbatim and reported via a table-fallback warning."""
    html_table = '<table><tr><td rowspan="2">\\(x_i\\)</td><td>y</td></tr></table>'

    normalized = normalize_markdown(html_table)

    assert normalized.markdown == html_table
    emitted_codes = [warning.code for warning in normalized.warnings]
    assert emitted_codes == [WarningCode.TABLE_FALLBACK]
+118
View File
@@ -0,0 +1,118 @@
from __future__ import annotations
import json
from pathlib import Path
import pytest
from pdf2md.math_render import MathJaxCommandResult, MathJaxRenderChecker
from pdf2md.quality import MathCheckerUnavailable, MathExpression
def test_mathjax_checker_batches_expressions_as_json(tmp_path: Path) -> None:
    """All expressions go to the helper in one JSON payload and results are mapped back in order."""
    helper = make_helper(tmp_path)
    recorded_calls = []
    helper_reply = json.dumps(
        {
            "results": [
                {"index": 0, "ok": True},
                {"index": 1, "ok": False, "message": "Undefined control sequence"},
            ]
        }
    )

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # Record the decoded payload so the request shape can be asserted below.
        recorded_calls.append((command, json.loads(stdin), timeout_seconds))
        return MathJaxCommandResult(command, 0, stdout=helper_reply)

    checker = MathJaxRenderChecker(
        helper_path=helper,
        which=lambda executable: "C:/node/node.exe" if executable == "node" else None,
        runner=runner,
        timeout_seconds=7,
    )
    inline_ok = MathExpression(0, "x_i^2", False, (0, 7))
    display_bad = MathExpression(1, "\\bad", True, (9, 18))

    results = checker.check_expressions((inline_ok, display_bad))

    assert [result.ok for result in results] == [True, False]
    assert results[1].message == "Undefined control sequence"

    expected_payload = {
        "expressions": [
            {"index": 0, "body": "x_i^2", "display": False},
            {"index": 1, "body": "\\bad", "display": True},
        ]
    }
    expected_command = ("C:/node/node.exe", str(helper))
    # Exactly one helper invocation, carrying both expressions and the configured timeout.
    assert recorded_calls == [(expected_command, expected_payload, 7)]
def test_mathjax_checker_reports_missing_node_as_unavailable(tmp_path: Path) -> None:
    """When ``which`` finds no executable, checking raises ``MathCheckerUnavailable``."""
    checker = MathJaxRenderChecker(helper_path=make_helper(tmp_path), which=lambda _: None)
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="Node.js"):
        checker.check_expressions(single_expression)
def test_mathjax_checker_reports_helper_failure_as_unavailable(tmp_path: Path) -> None:
    """A non-zero helper exit surfaces its stderr text through ``MathCheckerUnavailable``."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # Simulated failure: exit code 124 with a timeout message on stderr.
        return MathJaxCommandResult(command, 124, stderr="MathJax helper timed out")

    checker = MathJaxRenderChecker(helper_path=helper, which=lambda _: "node", runner=runner)
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="timed out"):
        checker.check_expressions(single_expression)
def test_mathjax_checker_reports_invalid_json_as_unavailable(tmp_path: Path) -> None:
    """Helper stdout that is not JSON raises ``MathCheckerUnavailable``."""
    helper = make_helper(tmp_path)

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # Successful exit, but the payload cannot be parsed.
        return MathJaxCommandResult(command, 0, stdout="not json")

    checker = MathJaxRenderChecker(helper_path=helper, which=lambda _: "node", runner=runner)
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="invalid JSON"):
        checker.check_expressions(single_expression)
def test_mathjax_checker_rejects_mismatched_result_indexes(tmp_path: Path) -> None:
    """A result whose index matches no submitted expression raises ``MathCheckerUnavailable``."""
    helper = make_helper(tmp_path)
    stray_reply = json.dumps({"results": [{"index": 99, "ok": True}]})

    def runner(command: tuple[str, ...], stdin: str, timeout_seconds: int) -> MathJaxCommandResult:
        # Index 99 was never submitted; only index 0 is sent below.
        return MathJaxCommandResult(command, 0, stdout=stray_reply)

    checker = MathJaxRenderChecker(helper_path=helper, which=lambda _: "node", runner=runner)
    single_expression = (MathExpression(0, "x", False, (0, 3)),)

    with pytest.raises(MathCheckerUnavailable, match="indexes"):
        checker.check_expressions(single_expression)
def make_helper(tmp_path: Path) -> Path:
    """Write a placeholder ``check.mjs`` file under ``tmp_path`` and return its path."""
    script_path = tmp_path / "check.mjs"
    placeholder_source = "// fake helper"
    script_path.write_text(placeholder_source, encoding="utf-8")
    return script_path
+173
View File
@@ -0,0 +1,173 @@
from __future__ import annotations
import json
from pathlib import Path
import pytest
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
from pdf2md.metadata import MetadataInputError, build_metadata, build_summary
def make_document(tmp_path: Path) -> DocumentRecord:
    """Build a two-page document with three formula blocks, one asset and two warnings."""
    first_page_blocks = (
        BlockRecord(BlockType.HEADING, page_index=0),
        BlockRecord(BlockType.INLINE_FORMULA, page_index=0, confidence=0.98),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0, bbox=(1.0, 2.0, 3.0, 4.0)),
    )
    second_page_blocks = (
        BlockRecord(BlockType.PARAGRAPH, page_index=1),
        BlockRecord(BlockType.DISPLAY_FORMULA, page_index=1),
    )
    # Warning order is deliberately page 1 first, then page 0.
    recorded_warnings = (
        WarningRecord(
            WarningCode.READING_ORDER_UNCERTAIN,
            WarningSeverity.WARNING,
            "Check reading order.",
            page_index=1,
        ),
        WarningRecord(
            WarningCode.MATH_RENDER_FAILED,
            WarningSeverity.ERROR,
            "Math failed to render.",
            page_index=0,
        ),
    )
    return DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(
            PageRecord(page_index=0, blocks=first_page_blocks),
            PageRecord(page_index=1, blocks=second_page_blocks),
        ),
        assets=(AssetRecord("paper.assets/figure.png", page_index=1),),
        warnings=recorded_warnings,
    )
def build_test_metadata(tmp_path: Path) -> dict[str, object]:
    """Return ``build_metadata`` output for the sample document with fixed inputs."""
    sample_document = make_document(tmp_path)
    zero_digest = "0" * 64
    return build_metadata(
        document=sample_document,
        source_sha256=zero_digest,
        created_at="2026-05-07T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
def test_metadata_has_required_top_level_fields(tmp_path: Path) -> None:
    """The metadata dictionary has exactly the expected top-level keys."""
    expected_keys = {
        "source_pdf",
        "source_sha256",
        "created_at",
        "engine",
        "engine_version",
        "engine_options",
        "pages",
        "assets",
        "warnings",
        "summary",
    }

    metadata = build_test_metadata(tmp_path)

    assert set(metadata) == expected_keys
def test_metadata_summary_counts_from_records(tmp_path: Path) -> None:
    """Summary counts match the pages, warnings, assets and formula blocks of the sample document."""
    expected_summary = {
        "pages_processed": 2,
        "warning_count": 2,
        "asset_count": 1,
        "display_formula_count": 2,
        "inline_formula_count": 1,
        "math_render_error_count": 1,
    }

    metadata = build_test_metadata(tmp_path)

    assert metadata["summary"] == expected_summary
def test_warning_order_and_page_provenance_are_preserved(tmp_path: Path) -> None:
    """Warnings keep their input order and each keeps its own page index."""
    serialized_warnings = build_test_metadata(tmp_path)["warnings"]

    codes_in_order = [entry["code"] for entry in serialized_warnings]
    assert codes_in_order == [
        "READING_ORDER_UNCERTAIN",
        "MATH_RENDER_FAILED",
    ]
    assert serialized_warnings[0]["page_index"] == 1
    assert serialized_warnings[1]["page_index"] == 0
def test_optional_bbox_and_confidence_are_preserved_only_when_present(tmp_path: Path) -> None:
    """First-page blocks carry ``confidence`` / ``bbox`` only where the record set them."""
    first_page_blocks = build_test_metadata(tmp_path)["pages"][0]["blocks"]

    # Heading block: no confidence was given.
    assert "confidence" not in first_page_blocks[0]
    # Inline formula: confidence given, bbox not.
    assert first_page_blocks[1]["confidence"] == 0.98
    assert "bbox" not in first_page_blocks[1]
    # Display formula: bbox given.
    assert first_page_blocks[2]["bbox"] == [1.0, 2.0, 3.0, 4.0]
def test_metadata_is_json_serializable(tmp_path: Path) -> None:
    """``json.dumps`` must accept the metadata without raising."""
    metadata = build_test_metadata(tmp_path)
    json.dumps(metadata)
@pytest.mark.parametrize(
    ("field_name", "kwargs"),
    [
        ("document", {"document": None}),
        ("source_sha256", {"source_sha256": ""}),
        ("created_at", {"created_at": ""}),
        ("engine", {"engine": ""}),
        ("engine_version", {"engine_version": ""}),
    ],
)
def test_metadata_requires_core_inputs(tmp_path: Path, field_name: str, kwargs: dict[str, object]) -> None:
    """Blanking any single core input raises ``MetadataInputError`` that names the field."""
    valid_inputs: dict[str, object] = {
        "document": make_document(tmp_path),
        "source_sha256": "0" * 64,
        "created_at": "2026-05-07T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
    }
    # Overlay the one invalid value for this parametrized case.
    call_arguments: dict[str, object] = {**valid_inputs, **kwargs}

    with pytest.raises(MetadataInputError, match=field_name):
        build_metadata(**call_arguments)
def test_engine_options_must_be_json_serializable(tmp_path: Path) -> None:
    """Engine options containing a ``Path`` value are rejected as not JSON serializable."""
    unserializable_options = {"path": tmp_path}
    sample_document = make_document(tmp_path)

    with pytest.raises(MetadataInputError, match="JSON serializable"):
        build_metadata(
            document=sample_document,
            source_sha256="0" * 64,
            created_at="2026-05-07T00:00:00Z",
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=unserializable_options,
        )
def test_formula_counts_come_from_block_types_not_markdown_text(tmp_path: Path) -> None:
    """A document with only paragraph and unknown blocks reports zero formulas of either kind."""
    non_formula_blocks = (BlockRecord(BlockType.PARAGRAPH), BlockRecord(BlockType.UNKNOWN))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(PageRecord(page_index=0, blocks=non_formula_blocks),),
    )

    summary = build_summary(document)

    assert summary["inline_formula_count"] == 0
    assert summary["display_formula_count"] == 0
def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path) -> None:
    """An INFO-severity ``MATH_RENDER_FAILED`` warning counts as a warning but not as a render error."""
    informational = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.INFO,
        "Checker unavailable.",
    )
    formula_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.INLINE_FORMULA),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page,),
        warnings=(informational,),
    )

    summary = build_summary(document)

    assert summary["warning_count"] == 1
    assert summary["math_render_error_count"] == 0
+264
View File
@@ -0,0 +1,264 @@
from __future__ import annotations
import os
from pathlib import Path
import pytest
from pdf2md.ir import WarningCode
from pdf2md.mineru_adapter import (
CommandResult,
MinerUAdapter,
MinerUOptions,
StrictLocalViolationError,
)
class FakeRunner:
    """Command-runner fake that replays queued results and records every command it receives."""

    def __init__(self, *results: CommandResult) -> None:
        # Results are consumed front-to-back, one per call.
        self.results = list(results)
        self.commands: list[tuple[str, ...]] = []

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Record ``command`` and return the next queued result re-stamped with that command.

        Raises:
            AssertionError: If no queued result is left.
        """
        self.commands.append(command)
        if len(self.results) == 0:
            raise AssertionError("fake runner was called without a queued result")
        queued = self.results.pop(0)
        # The queued result's own ``command`` is replaced by the one actually received.
        return CommandResult(
            command=command,
            exit_code=queued.exit_code,
            stdout=queued.stdout,
            stderr=queued.stderr,
        )
class EnvironmentRunner:
    """Runner fake that snapshots two GPU-related environment variables and writes ``paper.md``."""

    def __init__(self) -> None:
        # Both stay ``None`` until the runner is invoked.
        self.mineru_device_mode: str | None = None
        self.cuda_visible_devices: str | None = None

    def __call__(self, command: tuple[str, ...]) -> CommandResult:
        """Capture the environment, create the output directory with a Markdown file, and succeed."""
        self.mineru_device_mode = os.environ.get("MINERU_DEVICE_MODE")
        self.cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")

        # The output directory is the argument that follows ``-o`` in the command.
        output_flag_position = command.index("-o")
        work_dir = Path(command[output_flag_position + 1])
        work_dir.mkdir(parents=True, exist_ok=True)
        markdown_file = work_dir / "paper.md"
        markdown_file.write_text("# Title\n", encoding="utf-8")

        return CommandResult(command=command, exit_code=0)
def available(_: str) -> str:
    """``which`` replacement that reports a fixed MinerU path for any executable name."""
    fake_location = "C:/local/bin/mineru.exe"
    return fake_location
def missing(_: str) -> None:
    """``which`` replacement that reports every executable as not found."""
    not_found = None
    return not_found
def test_availability_check_uses_mockable_which() -> None:
    """``is_available()`` reflects whatever the injected ``which`` reports."""
    found = MinerUAdapter(which=available, runner=FakeRunner()).is_available()
    assert found is True

    not_found = MinerUAdapter(which=missing, runner=FakeRunner()).is_available()
    assert not_found is False
@pytest.mark.parametrize("executable", ["mineru-api", "python", "C:/tools/mineru.exe"])
def test_custom_executable_is_rejected(executable: str) -> None:
    """Constructing the adapter with any of these executables raises ``StrictLocalViolationError``."""
    stub_runner = FakeRunner()
    with pytest.raises(StrictLocalViolationError):
        MinerUAdapter(executable=executable, which=available, runner=stub_runner)
def test_missing_mineru_does_not_call_runner(tmp_path: Path) -> None:
    """With MinerU not found, ``convert`` fails with ``ENGINE_MISSING`` and runs no command."""
    stub_runner = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=stub_runner)

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work")

    assert outcome.succeeded is False
    assert outcome.exit_code is None
    assert stub_runner.commands == []
    emitted_codes = [warning.code for warning in outcome.warnings]
    assert emitted_codes == [WarningCode.ENGINE_MISSING]
def test_missing_mineru_version_does_not_call_runner() -> None:
    """With MinerU not found, ``version`` reports unavailable with ``ENGINE_MISSING`` and runs nothing."""
    stub_runner = FakeRunner()
    adapter = MinerUAdapter(which=missing, runner=stub_runner)

    probe = adapter.version()

    assert probe.available is False
    assert probe.exit_code is None
    assert stub_runner.commands == []
    emitted_codes = [warning.code for warning in probe.warnings]
    assert emitted_codes == [WarningCode.ENGINE_MISSING]
def test_version_success_uses_stdout() -> None:
    """A successful version command yields the stdout text without its trailing newline."""
    stub_runner = FakeRunner(CommandResult((), 0, stdout="MinerU 3.1.0\n"))
    adapter = MinerUAdapter(which=available, runner=stub_runner)
    version_command = ("mineru", "--version")

    probe = adapter.version()

    assert probe.available is True
    assert probe.version == "MinerU 3.1.0"
    assert probe.command == version_command
    assert stub_runner.commands == [version_command]
def test_version_success_can_use_stderr() -> None:
    """When stdout is empty the version text is taken from stderr."""
    stderr_only = CommandResult((), 0, stderr="MinerU 3.1.0\n")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(stderr_only))

    probe = adapter.version()

    assert probe.version == "MinerU 3.1.0"
def test_version_failure_is_explicit() -> None:
    """A non-zero version exit yields no version, keeps the exit code, and adds a CLI-failed warning."""
    failing = CommandResult((), 2, stdout="", stderr="bad version")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(failing))

    probe = adapter.version()

    assert probe.version is None
    assert probe.exit_code == 2
    emitted_codes = [warning.code for warning in probe.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
def test_version_empty_output_is_explicit() -> None:
    """Exit code 0 with no output at all still produces a CLI-failed warning and no version."""
    silent_success = CommandResult((), 0, stdout="", stderr="")
    adapter = MinerUAdapter(which=available, runner=FakeRunner(silent_success))

    probe = adapter.version()

    assert probe.available is True
    assert probe.version is None
    assert probe.exit_code == 0
    emitted_codes = [warning.code for warning in probe.warnings]
    assert emitted_codes == [WarningCode.MINERU_CLI_FAILED]
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
    """Paths with spaces and non-ASCII names are passed as single arguments, with no API URL flag."""
    adapter = MinerUAdapter(which=available, runner=FakeRunner())
    input_pdf = tmp_path / "논문 with spaces.pdf"
    work_dir = tmp_path / "work output"
    expected_command = ("mineru", "-p", str(input_pdf), "-o", str(work_dir))

    built = adapter.build_command(input_pdf, work_dir)

    assert built == expected_command
    assert "--api-url" not in built
@pytest.mark.parametrize(
    "options",
    [
        MinerUOptions(extra_cli_args=("--api-url", "http://example.test")),
        MinerUOptions(engine_options={"api_url": "http://example.test"}),
        MinerUOptions(engine_options={"base_url": "http://example.test"}),
        MinerUOptions(engine_options={"mode": "router"}),
        MinerUOptions(engine_options={"backend": "http"}),
        MinerUOptions(engine_options={"openai_base_url": "http://example.test/v1"}),
        MinerUOptions(engine_options={"endpoint": "https://example.test"}),
        MinerUOptions(engine_options={"nested": {"url": "local http://example.test"}}),
        MinerUOptions(engine_options={"process": "mineru-api"}),
        MinerUOptions(gpu_device="https://example.test/gpu"),
        MinerUOptions(strict_local=False),
    ],
)
def test_strict_local_rejects_remote_router_and_backend_options(tmp_path: Path, options: MinerUOptions) -> None:
    """Each listed option set makes ``build_command`` raise ``StrictLocalViolationError``."""
    adapter = MinerUAdapter(which=available, runner=FakeRunner())
    input_pdf = tmp_path / "paper.pdf"
    work_dir = tmp_path / "work"

    with pytest.raises(StrictLocalViolationError):
        adapter.build_command(input_pdf, work_dir, options)
def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path) -> None:
    """A successful run returns the Markdown, parsed JSON, and a sorted list of asset files."""
    work_dir = tmp_path / "work"
    assets_dir = work_dir / "assets"
    input_pdf = tmp_path / "paper.pdf"

    # Primary outputs, plus an empty directory next to them.
    (work_dir / "nested").mkdir(parents=True)
    (work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
    (work_dir / "structured.json").write_text('{"pages": 1}', encoding="utf-8")

    # Assets are written out of alphabetical order; the result is expected sorted.
    assets_dir.mkdir()
    (assets_dir / "z.png").write_bytes(b"z")
    (assets_dir / "a.png").write_bytes(b"a")
    (assets_dir / "nested").mkdir()
    (assets_dir / "nested" / "b.png").write_bytes(b"b")

    # Extra files that the assertions below expect to be absent from ``asset_paths``.
    (work_dir / "zz_extra.md").write_text("not an asset", encoding="utf-8")
    (work_dir / "zz_extra.json").write_text("{}", encoding="utf-8")
    (work_dir / "run.log").write_text("diagnostic", encoding="utf-8")

    stub_runner = FakeRunner(CommandResult((), 0, stdout="ok", stderr="warn"))
    adapter = MinerUAdapter(which=available, runner=stub_runner)

    outcome = adapter.convert(
        input_pdf,
        work_dir,
        MinerUOptions(engine_version="3.1.0", gpu_device="cuda:0"),
    )

    assert outcome.succeeded is True
    assert outcome.command == ("mineru", "-p", str(input_pdf), "-o", str(work_dir))
    assert outcome.raw_markdown == "# Title\n"
    assert outcome.raw_structured == {"pages": 1}
    relative_assets = [path.relative_to(work_dir).as_posix() for path in outcome.asset_paths]
    assert relative_assets == [
        "assets/a.png",
        "assets/nested/b.png",
        "assets/z.png",
    ]
    assert outcome.engine == "MinerU"
    assert outcome.engine_version == "3.1.0"
    assert outcome.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
    assert outcome.exit_code == 0
    assert outcome.stdout == "ok"
    assert outcome.stderr == "warn"
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None:
    """The GPU option reaches the runner, and the pre-existing environment values survive the call."""
    previous_values = {"MINERU_DEVICE_MODE": "cpu", "CUDA_VISIBLE_DEVICES": "7"}
    for variable, value in previous_values.items():
        monkeypatch.setenv(variable, value)

    env_runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=env_runner)
    options = MinerUOptions(gpu_device="cuda:0")

    result = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    assert result.succeeded is True
    # Presumably recorded by EnvironmentRunner while the command ran; verify against its definition.
    assert env_runner.mineru_device_mode == "cuda"
    assert env_runner.cuda_visible_devices == "0"
    # After convert() returns, the process environment holds the values set before the call.
    for variable, value in previous_values.items():
        assert os.environ[variable] == value
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
    """A non-zero exit code fails the conversion even when a Markdown file already exists."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    (work_dir / "paper.md").write_text("existing output", encoding="utf-8")
    failing_runner = FakeRunner(CommandResult((), 3, stdout="out", stderr="failed"))
    adapter = MinerUAdapter(which=available, runner=failing_runner)

    result = adapter.convert(tmp_path / "paper.pdf", work_dir)

    assert result.succeeded is False
    assert result.raw_markdown is None
    assert result.asset_paths == ()
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
def test_exit_zero_with_no_usable_output_warns(tmp_path: Path) -> None:
    """Exit code 0 with an empty work directory is reported as a failure with a warning."""
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    silent_runner = FakeRunner(CommandResult((), 0))
    adapter = MinerUAdapter(which=available, runner=silent_runner)

    result = adapter.convert(tmp_path / "paper.pdf", work_dir)

    assert result.succeeded is False
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
    first_warning = result.warnings[0]
    assert "no usable" in first_warning.message
def test_invalid_json_is_preserved_as_text_with_warning(tmp_path: Path) -> None:
    """Unparseable JSON is kept as raw text and flagged, while the conversion still succeeds."""
    broken_json = "{not json"
    work_dir = tmp_path / "work"
    work_dir.mkdir()
    (work_dir / "paper.md").write_text("markdown", encoding="utf-8")
    (work_dir / "structured.json").write_text(broken_json, encoding="utf-8")
    fake_runner = FakeRunner(CommandResult((), 0))
    adapter = MinerUAdapter(which=available, runner=fake_runner)

    result = adapter.convert(tmp_path / "paper.pdf", work_dir)

    assert result.succeeded is True
    assert result.raw_structured == broken_json
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MINERU_CLI_FAILED]
+8
View File
@@ -0,0 +1,8 @@
from __future__ import annotations
import pdf2md
def test_package_imports() -> None:
    """The package exposes the expected version string and a callable convert_pdf."""
    expected_version = "0.1.0"
    assert pdf2md.__version__ == expected_version
    assert callable(pdf2md.convert_pdf)
+188
View File
@@ -0,0 +1,188 @@
from __future__ import annotations
import os
from pathlib import Path
import pytest
from pdf2md.paths import (
DiscoveredPdf,
DuplicateOutputPathError,
InputDiscoveryError,
OutputConflictError,
OutputPathError,
OutputRootError,
discover_pdfs,
plan_outputs,
plan_pdf_outputs,
)
def touch(path: Path) -> Path:
    """Write an empty file at *path*, creating missing parent directories, and return *path*.

    An existing file at *path* is overwritten with empty content.
    """
    parent_dir = path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    path.write_bytes(b"")
    return path
def test_discovers_single_pdf_case_insensitive(tmp_path: Path) -> None:
    """A single file with an upper-case .PDF suffix is discovered."""
    pdf = touch(tmp_path / "Paper.PDF")

    discovered = discover_pdfs(pdf)

    expected = (DiscoveredPdf(source_path=pdf.resolve()),)
    assert discovered == expected
def test_rejects_nonexistent_and_non_pdf_inputs(tmp_path: Path) -> None:
    """Missing paths and non-PDF files each raise InputDiscoveryError with a specific message."""
    missing_pdf = tmp_path / "missing.pdf"
    with pytest.raises(InputDiscoveryError, match="does not exist"):
        discover_pdfs(missing_pdf)

    text_file = touch(tmp_path / "notes.txt")
    with pytest.raises(InputDiscoveryError, match="not a PDF"):
        discover_pdfs(text_file)
def test_discovers_directory_non_recursive_only(tmp_path: Path) -> None:
    """Non-recursive discovery returns only the PDF directly inside the directory."""
    root_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")

    discovered = discover_pdfs(tmp_path, recursive=False)

    found_paths = [item.source_path for item in discovered]
    assert found_paths == [root_pdf.resolve()]
    assert nested_pdf.resolve() not in set(found_paths)
def test_non_recursive_directory_with_only_nested_pdfs_fails(tmp_path: Path) -> None:
    """Non-recursive discovery raises when the only PDF lives in a subdirectory."""
    nested_only = tmp_path / "nested" / "child.pdf"
    touch(nested_only)

    with pytest.raises(InputDiscoveryError, match="no PDF files"):
        discover_pdfs(tmp_path, recursive=False)
def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) -> None:
    """Recursive discovery reports each PDF with its parent directory relative to the input root."""
    root_pdf = touch(tmp_path / "root.pdf")
    nested_pdf = touch(tmp_path / "nested" / "child.pdf")
    deeper_pdf = touch(tmp_path / "nested" / "deeper" / "leaf.PdF")

    discovered = discover_pdfs(tmp_path, recursive=True)

    # Expected order: nested entries first, the root-level PDF last.
    expected = [
        (nested_pdf.resolve(), Path("nested")),
        (deeper_pdf.resolve(), Path("nested", "deeper")),
        (root_pdf.resolve(), Path()),
    ]
    actual = [(item.source_path, item.relative_parent) for item in discovered]
    assert actual == expected
def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
    """Two discovery runs agree, and names come back in the expected order."""
    for file_name in ("한글.pdf", "Alpha.pdf", "beta.PDF"):
        touch(tmp_path / file_name)

    first = discover_pdfs(tmp_path)
    second = discover_pdfs(tmp_path)

    discovered_names = [item.source_path.name for item in first]
    assert discovered_names == ["Alpha.pdf", "beta.PDF", "한글.pdf"]
    assert first == second
def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
    """Default planning derives every output path from the PDF stem and leaves raw_dir unset."""
    pdf = touch(tmp_path / "입력.pdf")
    output_root = tmp_path / "out"

    (plan,) = plan_pdf_outputs(pdf, output_root)

    resolved_root = output_root.resolve()
    assert plan.source_pdf == pdf.resolve()
    assert plan.markdown_path == resolved_root / "입력.md"
    assert plan.assets_dir == resolved_root / "입력.assets"
    assert plan.metadata_path == resolved_root / "입력.metadata.json"
    assert plan.report_path == resolved_root / "입력.report.md"
    assert plan.raw_dir is None
def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None:
    """metadata=False drops the metadata path; keep_raw=True adds a raw directory."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"

    (without_metadata,) = plan_pdf_outputs(pdf, output_root, metadata=False)
    (with_raw,) = plan_pdf_outputs(pdf, output_root, keep_raw=True)

    resolved_root = output_root.resolve()
    assert without_metadata.metadata_path is None
    assert without_metadata.report_path == resolved_root / "paper.report.md"
    assert with_raw.raw_dir == resolved_root / "paper.raw"
def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
    """Same-named PDFs in different subdirectories get distinct outputs mirroring the input tree."""
    root = tmp_path / "pdfs"
    output_root = tmp_path / "out"
    touch(root / "same.pdf")
    touch(root / "nested" / "same.pdf")

    plans = plan_pdf_outputs(root, output_root, recursive=True)

    relative_markdown = [plan.markdown_path.relative_to(output_root.resolve()) for plan in plans]
    assert relative_markdown == [
        Path("nested", "same.md"),
        Path("same.md"),
    ]
def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
    """Two sources that map to the same output path raise DuplicateOutputPathError."""
    first = touch(tmp_path / "first" / "same.pdf")
    second = touch(tmp_path / "second" / "same.pdf")
    # Neither record sets relative_parent, so both share the stem "same" with no distinguishing subdirectory.
    discovered = tuple(DiscoveredPdf(source_path=source.resolve()) for source in (first, second))

    with pytest.raises(DuplicateOutputPathError, match="duplicated"):
        plan_outputs(discovered, tmp_path / "out")
def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
    """Every pre-existing planned output is listed in the conflict error."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    # Pre-create three planned outputs: two as directories, one as a file.
    (output_root / "paper.assets").mkdir(parents=True)
    (output_root / "paper.md").mkdir()
    touch(output_root / "paper.metadata.json")

    with pytest.raises(OutputConflictError) as error:
        plan_pdf_outputs(pdf, output_root)

    reported_names = {path.name for path in error.value.conflicts}
    expected_names = {"paper.assets", "paper.md", "paper.metadata.json"}
    assert reported_names == expected_names
def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
    """With overwrite=True planning succeeds and the existing file is still present afterwards."""
    pdf = touch(tmp_path / "paper.pdf")
    output_root = tmp_path / "out"
    existing = touch(output_root / "paper.md")

    (plan,) = plan_pdf_outputs(pdf, output_root, overwrite=True)

    assert plan.markdown_path == existing.resolve()
    assert existing.exists()
def test_output_root_cannot_be_existing_file(tmp_path: Path) -> None:
    """An output root that is a regular file is rejected with OutputRootError."""
    pdf = touch(tmp_path / "paper.pdf")
    file_as_root = touch(tmp_path / "out")

    with pytest.raises(OutputRootError, match="not a directory"):
        plan_pdf_outputs(pdf, file_as_root)
def test_planned_paths_cannot_escape_output_root(tmp_path: Path) -> None:
    """A relative parent of ".." is rejected with OutputPathError."""
    pdf = touch(tmp_path / "paper.pdf")
    escaping = DiscoveredPdf(source_path=pdf.resolve(), relative_parent=Path(".."))

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((escaping,), tmp_path / "out")
@pytest.mark.skipif(os.name != "nt", reason="Windows rooted path behavior")
@pytest.mark.parametrize("relative_parent", [Path("\\outside"), Path("/outside"), Path("C:outside")])
def test_windows_rooted_relative_parents_cannot_escape_output_root(
    tmp_path: Path,
    relative_parent: Path,
) -> None:
    """On Windows, rooted or drive-qualified relative parents are rejected with OutputPathError."""
    pdf = touch(tmp_path / "paper.pdf")
    rooted = DiscoveredPdf(source_path=pdf.resolve(), relative_parent=relative_parent)

    with pytest.raises(OutputPathError, match="escape"):
        plan_outputs((rooted,), tmp_path / "out")
+62
View File
@@ -0,0 +1,62 @@
from __future__ import annotations
from pathlib import Path
import pytest
from pypdf import PdfReader, PdfWriter
from pdf2md.pdf_splitter import PdfChunkError, count_pdf_pages, plan_pdf_chunks, write_pdf_chunk
def make_blank_pdf(path: Path, page_count: int) -> Path:
    """Write a PDF with *page_count* blank 72x72 pages to *path* and return *path*.

    Missing parent directories are created first.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    pdf_writer = PdfWriter()
    for _page_number in range(page_count):
        pdf_writer.add_blank_page(width=72, height=72)
    with path.open(mode="wb") as stream:
        pdf_writer.write(stream)
    return path
@pytest.mark.parametrize(
    ("page_count", "expected_ranges"),
    [
        (1, [(1, 1)]),
        (20, [(1, 20)]),
        (21, [(1, 20), (21, 21)]),
        (40, [(1, 20), (21, 40)]),
        (41, [(1, 20), (21, 40), (41, 41)]),
    ],
)
def test_plan_pdf_chunks_uses_one_based_ranges_and_names(
    tmp_path: Path,
    page_count: int,
    expected_ranges: list[tuple[int, int]],
) -> None:
    """Chunks of 20 pages carry 1-based inclusive page ranges and zero-padded file names."""
    pdf = make_blank_pdf(tmp_path / "paper.pdf", page_count)

    chunks = plan_pdf_chunks(pdf, chunk_pages=20)

    assert count_pdf_pages(pdf) == page_count
    planned_ranges = [(chunk.source_page_start, chunk.source_page_end) for chunk in chunks]
    assert planned_ranges == expected_ranges
    # File names number the parts from 1 and embed the page range, each padded to three digits.
    expected_names = [
        f"paper.part-{index:03d}.pages-{start:03d}-{end:03d}.pdf"
        for index, (start, end) in enumerate(expected_ranges, start=1)
    ]
    assert [chunk.output_filename for chunk in chunks] == expected_names
def test_write_pdf_chunk_writes_expected_page_count(tmp_path: Path) -> None:
    """Writing the second chunk of a 41-page PDF produces a file with 20 pages."""
    pdf = make_blank_pdf(tmp_path / "paper.pdf", 41)
    planned_chunks = plan_pdf_chunks(pdf, chunk_pages=20)
    second_chunk = planned_chunks[1]
    destination = tmp_path / "chunks" / second_chunk.output_filename

    output = write_pdf_chunk(second_chunk, destination)

    assert output.exists()
    written_pages = PdfReader(output).pages
    assert len(written_pages) == 20
def test_plan_pdf_chunks_rejects_non_positive_chunk_size(tmp_path: Path) -> None:
    """A chunk size of zero raises PdfChunkError."""
    single_page_pdf = make_blank_pdf(tmp_path / "paper.pdf", 1)

    with pytest.raises(PdfChunkError, match="positive integer"):
        plan_pdf_chunks(single_page_pdf, chunk_pages=0)
+144
View File
@@ -0,0 +1,144 @@
from __future__ import annotations
from pathlib import Path
from pdf2md.ir import WarningCode, WarningSeverity
from pdf2md.quality import (
MathCheckerUnavailable,
MathCheckResult,
check_asset_links,
check_math_renderability,
extract_math_expressions,
merge_quality_results,
)
def test_missing_asset_link_is_counted(tmp_path: Path) -> None:
    """A link to a file absent from the asset directory counts as missing, not invalid."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    markdown = "![missing](assets/missing.png)"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=asset_root)

    assert result.missing_asset_link_count == 1
    assert result.invalid_asset_link_count == 0
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_MISSING]
def test_existing_asset_link_passes_without_warning(tmp_path: Path) -> None:
    """A link to an asset file that exists produces no failures and no warnings."""
    asset_root = tmp_path / "assets"
    asset_root.mkdir()
    (asset_root / "fig.png").write_bytes(b"image")
    markdown = "![fig](assets/fig.png)"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=asset_root)

    assert result.failure_count == 0
    assert result.warnings == ()
def test_invalid_asset_links_are_counted_without_fetching(tmp_path: Path) -> None:
    """Remote, parent-escaping, and absolute links are each counted as invalid."""
    invalid_links = (
        "![remote](https://example.test/fig.png)",
        "![escape](../outside.png)",
        r"![absolute](C:\tmp\fig.png)",
    )
    markdown = "\n".join(invalid_links)

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=tmp_path / "assets")

    assert result.invalid_asset_link_count == 3
    assert result.missing_asset_link_count == 0
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.ASSET_LINK_INVALID] * 3
def test_asset_links_inside_code_are_ignored(tmp_path: Path) -> None:
    """Image links inside a fenced block or an inline code span produce no failures."""
    markdown = "```md\n![missing](assets/missing.png)\n```\n`![missing](assets/inline.png)`"
    asset_root = tmp_path / "assets"

    result = check_asset_links(markdown, markdown_dir=tmp_path, asset_root=asset_root)

    assert result.failure_count == 0
    assert result.warnings == ()
def test_math_render_failures_are_aggregated_with_fake_checker() -> None:
    """Only the expression the fake checker rejects is counted, and its message is surfaced."""

    def checker(body: str) -> MathCheckResult:
        # Fails any expression whose body contains "bad"; the message is always populated.
        passed = "bad" not in body
        return MathCheckResult(ok=passed, message=f"{body} failed")

    markdown = "$x_i^2$\n\n$$\nbad_math\n$$"

    result = check_math_renderability(markdown, checker)

    assert result.math_render_error_count == 1
    warning_codes = [warning.code for warning in result.warnings]
    assert warning_codes == [WarningCode.MATH_RENDER_FAILED]
    assert "bad_math failed" in result.warnings[0].message
def test_math_extraction_records_display_mode_and_markdown_spans() -> None:
    """Each extracted expression carries its index, body, display flag, and source span."""
    markdown = "Inline $x_i^2$ before\n\n$$\n\\frac{1}{2}\n$$\n"

    expressions = extract_math_expressions(markdown)

    summaries = [(expression.index, expression.body, expression.display) for expression in expressions]
    assert summaries == [
        (0, "x_i^2", False),
        (1, "\\frac{1}{2}", True),
    ]
    # Slicing the source with each span must give back the expression including its delimiters.
    spans = [expression.markdown_span for expression in expressions]
    assert [markdown[start:end] for start, end in spans] == [
        "$x_i^2$",
        "$$\n\\frac{1}{2}\n$$",
    ]
def test_math_extraction_ignores_code_and_currency_like_text() -> None:
    """Dollar signs in code and in a price are skipped; only the real inline formula is extracted."""
    markdown = "```tex\n$x$\n```\n`$y$`\nPrice $12.00$ and real $z$."

    expressions = extract_math_expressions(markdown)

    extracted = [(expression.body, expression.display) for expression in expressions]
    assert extracted == [("z", False)]
def test_batch_math_checker_receives_expression_records() -> None:
    """A checker with check_expressions receives all expression records in one call."""

    class BatchChecker:
        """Records the expressions it is given and passes only display-mode ones."""

        def __init__(self) -> None:
            self.expressions = ()

        def check_expressions(self, expressions):
            self.expressions = expressions
            outcomes = [MathCheckResult(ok=expression.display) for expression in expressions]
            return tuple(outcomes)

    checker = BatchChecker()
    markdown = "$inline$\n\n$$\ndisplay\n$$"

    result = check_math_renderability(markdown, checker)

    received_bodies = [expression.body for expression in checker.expressions]
    assert received_bodies == ["inline", "display"]
    assert result.math_render_error_count == 1
    assert "inline" in result.warnings[0].message
def test_math_checker_unavailable_is_nonfatal() -> None:
    """A checker raising MathCheckerUnavailable yields an INFO warning and no render errors."""

    def checker(_: str) -> bool:
        raise MathCheckerUnavailable("local renderer missing")

    result = check_math_renderability("$x$", checker)

    assert result.math_render_error_count == 0
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
def test_missing_math_checker_is_explicit_and_nonfatal() -> None:
    """Calling without a checker yields an INFO warning and no render errors."""
    result = check_math_renderability("$x$")

    assert result.math_render_error_count == 0
    first_warning = result.warnings[0]
    assert first_warning.code == WarningCode.MATH_RENDER_FAILED
    assert first_warning.severity == WarningSeverity.INFO
def test_merge_quality_results_combines_counts_and_warning_order(tmp_path: Path) -> None:
    """Merging keeps both counters and lists warnings in argument order."""
    asset_result = check_asset_links("![missing](assets/missing.png)", markdown_dir=tmp_path)
    math_result = check_math_renderability("$x$", lambda _: False)

    merged = merge_quality_results(asset_result, math_result)

    assert merged.missing_asset_link_count == 1
    assert merged.math_render_error_count == 1
    merged_codes = [warning.code for warning in merged.warnings]
    assert merged_codes == [
        WarningCode.ASSET_LINK_MISSING,
        WarningCode.MATH_RENDER_FAILED,
    ]
+163
View File
@@ -0,0 +1,163 @@
from __future__ import annotations
from pathlib import Path
from pdf2md.ir import (
AssetRecord,
BlockRecord,
BlockType,
DocumentRecord,
PageRecord,
WarningCode,
WarningRecord,
WarningSeverity,
)
from pdf2md.metadata import build_metadata
from pdf2md.quality import QualityResult
from pdf2md.report import determine_final_status, pages_with_warnings, render_report
def make_metadata(tmp_path: Path, *, warnings: tuple[WarningRecord, ...] = ()) -> dict[str, object]:
    """Build metadata for a fixed two-page document carrying the given document-level warnings.

    Page 0 holds one inline and one display formula block, page 1 holds one
    paragraph block, and a single asset is attached to page 1.
    """
    formula_page = PageRecord(
        page_index=0,
        blocks=(
            BlockRecord(BlockType.INLINE_FORMULA, page_index=0),
            BlockRecord(BlockType.DISPLAY_FORMULA, page_index=0),
        ),
    )
    paragraph_page = PageRecord(
        page_index=1,
        blocks=(BlockRecord(BlockType.PARAGRAPH, page_index=1),),
    )
    figure_asset = AssetRecord("paper.assets/fig.png", page_index=1)
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(formula_page, paragraph_page),
        assets=(figure_asset,),
        warnings=warnings,
    )
    return build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-08T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
        engine_options={"strict_local": True},
    )
def test_final_status_success_partial_and_failed(tmp_path: Path) -> None:
    """Status is success when clean, partial on a warning or quality failure, failed on an error."""
    review_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    cli_error = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")

    success_metadata = make_metadata(tmp_path)
    warning_metadata = make_metadata(tmp_path, warnings=(review_warning,))
    failed_metadata = make_metadata(tmp_path, warnings=(cli_error,))

    assert determine_final_status(success_metadata) == "success"
    assert determine_final_status(warning_metadata) == "partial"
    # Clean metadata combined with a quality failure is still only partial.
    quality_with_missing_asset = QualityResult(missing_asset_link_count=1)
    assert determine_final_status(success_metadata, quality_with_missing_asset) == "partial"
    assert determine_final_status(failed_metadata) == "failed"
def test_pages_with_warnings_are_sorted_and_derived_from_metadata_and_quality(tmp_path: Path) -> None:
    """Page indexes from metadata warnings and quality warnings are combined in ascending order."""
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    quality_warning = WarningRecord(
        WarningCode.MATH_RENDER_FAILED,
        WarningSeverity.WARNING,
        "Math failed.",
        page_index=0,
    )
    quality = QualityResult(warnings=(quality_warning,))

    assert pages_with_warnings(metadata, quality) == (0, 1)
def test_report_content_includes_required_sections_and_counts(tmp_path: Path) -> None:
    """The rendered report contains the title, paths, engine details, counts, and warning entries."""
    metadata_warning = WarningRecord(
        WarningCode.READING_ORDER_UNCERTAIN,
        WarningSeverity.WARNING,
        "Review page.",
        page_index=1,
    )
    metadata = make_metadata(tmp_path, warnings=(metadata_warning,))
    quality_warning = WarningRecord(WarningCode.ASSET_LINK_MISSING, WarningSeverity.WARNING, "Missing asset.")
    quality = QualityResult(
        missing_asset_link_count=2,
        invalid_asset_link_count=1,
        math_render_error_count=3,
        warnings=(quality_warning,),
    )

    report = render_report(
        metadata,
        quality=quality,
        markdown_path=tmp_path / "paper.md",
        metadata_path=tmp_path / "paper.metadata.json",
        report_path=tmp_path / "paper.report.md",
    )

    # Each fragment must appear verbatim somewhere in the report text.
    expected_fragments = (
        "# PDF-to-Markdown Quality Report",
        "- Final status: `partial`",
        f"- Source PDF: {tmp_path / 'paper.pdf'}",
        f"- Output Markdown: {tmp_path / 'paper.md'}",
        "- Engine: MinerU",
        "- Engine version: 3.1.0",
        '- Engine options: `{"strict_local": true}`',
        "- Pages processed: 2",
        "- Warning count: 2",
        "- Asset count: 1",
        "- Missing asset link count: 2",
        "- Invalid asset link count: 1",
        "- Inline formula count: 1",
        "- Display formula count: 1",
        "- Math render error count: 3",
        "- Page 1",
        "`ASSET_LINK_MISSING`",
    )
    for fragment in expected_fragments:
        assert fragment in report
def test_report_omits_absent_optional_paths_and_does_not_write_files(tmp_path: Path) -> None:
    """Without path arguments the report has no output-path lines and no report file appears."""
    metadata = make_metadata(tmp_path)
    report_path = tmp_path / "paper.report.md"

    report = render_report(metadata)

    for label in ("Output Markdown:", "Metadata JSON:", "Report Markdown:"):
        assert label not in report
    assert not report_path.exists()
def test_report_failed_status_comes_from_error_severity_warning(tmp_path: Path) -> None:
    """An ERROR-severity warning in the metadata makes the report show a failed status."""
    cli_error = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
    metadata = make_metadata(tmp_path, warnings=(cli_error,))

    report = render_report(metadata)

    assert "- Final status: `failed`" in report
def test_report_uses_metadata_math_render_count_plus_quality_count(tmp_path: Path) -> None:
    """One metadata math warning plus a quality count of two is reported as three."""
    math_warning = WarningRecord(WarningCode.MATH_RENDER_FAILED, WarningSeverity.ERROR, "Metadata math failed.")
    metadata = make_metadata(tmp_path, warnings=(math_warning,))
    quality = QualityResult(math_render_error_count=2)

    report = render_report(metadata, quality=quality)

    assert "- Math render error count: 3" in report
def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path: Path) -> None:
    """Chunk details under engine_options are rendered as a single chunk line."""
    chunk_options = {
        "chunk_index": 2,
        "total_chunks": 3,
        "source_page_start": 21,
        "source_page_end": 40,
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {"strict_local": True, "chunk": chunk_options}

    report = render_report(metadata)

    assert "- Chunk: 2/3, source pages: 21-40" in report