modify pdftomd
This commit is contained in:
@@ -39,7 +39,6 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
output_root = tmp_path / "mineru-fixture-output"
|
||||
attempts: list[dict[str, object]] = []
|
||||
for pdf in sample_pdfs:
|
||||
sample_output = output_root / pdf.stem
|
||||
completed = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
@@ -48,7 +47,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
"convert",
|
||||
str(pdf),
|
||||
"--out",
|
||||
str(sample_output),
|
||||
str(output_root),
|
||||
],
|
||||
cwd=REPO_ROOT,
|
||||
check=False,
|
||||
@@ -67,7 +66,7 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
"convert",
|
||||
str(pdf),
|
||||
"--out",
|
||||
str(sample_output),
|
||||
str(output_root),
|
||||
]
|
||||
),
|
||||
"exit_code": completed.returncode,
|
||||
@@ -77,34 +76,27 @@ def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -
|
||||
)
|
||||
assert completed.returncode == 0, json.dumps(attempts[-1], ensure_ascii=False, indent=2)
|
||||
|
||||
markdown_path = sample_output / f"{pdf.stem}.md"
|
||||
metadata_path = sample_output / f"{pdf.stem}.metadata.json"
|
||||
report_path = sample_output / f"{pdf.stem}.report.md"
|
||||
sample_output = output_root / pdf.stem
|
||||
markdown_path = sample_output / f"{pdf.stem}_001.md"
|
||||
report_path = sample_output / f"{pdf.stem}_report.md"
|
||||
assert markdown_path.exists()
|
||||
assert metadata_path.exists()
|
||||
assert report_path.exists()
|
||||
assert not list(sample_output.glob("*.metadata.json"))
|
||||
|
||||
metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
|
||||
summary = metadata["summary"]
|
||||
assert metadata["engine"] == "MinerU"
|
||||
assert summary["pages_processed"] >= 1
|
||||
assert "warning_count" in summary
|
||||
assert "math_render_error_count" in summary
|
||||
assert "asset_count" in summary
|
||||
report = report_path.read_text(encoding="utf-8")
|
||||
assert "Output Markdown:" in report
|
||||
assert "Metadata JSON:" in report
|
||||
assert "Metadata JSON:" not in report
|
||||
assert "Report Markdown:" in report
|
||||
assert "- Engine: MinerU" in report
|
||||
assert "- Pages processed:" in report
|
||||
assert "- Warning count:" in report
|
||||
assert "- Math render error count:" in report
|
||||
assert "- Asset count:" in report
|
||||
attempts[-1].update(
|
||||
{
|
||||
"markdown_path": str(markdown_path),
|
||||
"metadata_path": str(metadata_path),
|
||||
"report_path": str(report_path),
|
||||
"warning_count": summary["warning_count"],
|
||||
"final_status": _report_final_status(report),
|
||||
"math_render_error_count": summary["math_render_error_count"],
|
||||
"asset_count": summary["asset_count"],
|
||||
"pages_processed": summary["pages_processed"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
@@ -68,8 +67,13 @@ def make_pdf(directory: Path, name: str) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_counts(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "쉘구조_math.pdf")
|
||||
def report_metadata(result) -> dict:
    """Return the report metadata carried by a conversion result.

    Reads ``result._report_metadata`` and fails the calling test with an
    ``AssertionError`` when that attribute is ``None``.
    """
    payload = result._report_metadata
    assert payload is not None
    return payload
|
||||
|
||||
|
||||
def test_v1_fast_conversion_writes_markdown_report_assets_and_quality_counts(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "math.pdf")
|
||||
adapter = FixtureAdapter(
|
||||
raw_markdown=(
|
||||
"# Shell Element\n\n"
|
||||
@@ -85,17 +89,21 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c
|
||||
result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)
|
||||
|
||||
assert result.final_status == "partial"
|
||||
assert result.markdown_path == tmp_path / "out" / "math" / "math_001.md"
|
||||
assert result.markdown_path.exists()
|
||||
assert result.metadata_path is not None and result.metadata_path.exists()
|
||||
assert result.metadata_path is None
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
assert result.report_path == tmp_path / "out" / "math" / "math_report.md"
|
||||
assert result.report_path.exists()
|
||||
assert (tmp_path / "out" / "쉘구조_math.assets" / "mesh.png").read_bytes() == b"fake image"
|
||||
assert result.assets_dir == tmp_path / "out" / "math" / "images"
|
||||
assert (result.assets_dir / "mesh.png").read_bytes() == b"fake image"
|
||||
|
||||
markdown = result.markdown_path.read_text(encoding="utf-8")
|
||||
assert "$u_i$" in markdown
|
||||
assert "$$\nK u = f\n$$" in markdown
|
||||
assert "" in markdown
|
||||
assert "" in markdown
|
||||
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["engine"] == "MinerU"
|
||||
assert metadata["engine_version"] == "3.1.0"
|
||||
assert metadata["summary"]["pages_processed"] == 3
|
||||
@@ -105,18 +113,18 @@ def test_v1_fast_conversion_writes_markdown_metadata_report_assets_and_quality_c
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
assert metadata["summary"]["warning_count"] == 1
|
||||
assert metadata["warnings"][0]["code"] == "TABLE_FALLBACK"
|
||||
assert metadata["assets"] == [{"relative_path": "쉘구조_math.assets/mesh.png"}]
|
||||
assert metadata["assets"] == [{"relative_path": "images/mesh.png"}]
|
||||
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
assert "- Final status: `partial`" in report
|
||||
assert "- Output Markdown:" in report
|
||||
assert "- Metadata JSON:" in report
|
||||
assert "- Metadata JSON:" not in report
|
||||
assert "- Report Markdown:" in report
|
||||
assert "- Math render error count: 0" in report
|
||||
assert "`TABLE_FALLBACK`" in report
|
||||
|
||||
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_path: Path) -> None:
|
||||
def test_v1_fast_failure_records_no_fallback_and_writes_report_only(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path, "failed.pdf")
|
||||
adapter = FixtureAdapter(raw_markdown="", succeeded=False)
|
||||
|
||||
@@ -126,14 +134,15 @@ def test_v1_fast_failure_records_no_fallback_and_writes_no_release_outputs(tmp_p
|
||||
assert result.warning_count == 1
|
||||
assert result.warnings[0].code == WarningCode.MINERU_CLI_FAILED
|
||||
assert not result.markdown_path.exists()
|
||||
assert not result.report_path.exists()
|
||||
assert result.metadata_path is not None and not result.metadata_path.exists()
|
||||
assert result.report_path.exists()
|
||||
assert result.metadata_path is None
|
||||
assert "- Final status: `failed`" in result.report_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, capsys) -> None:
|
||||
source = tmp_path / "pdfs"
|
||||
first = make_pdf(source, "a.pdf")
|
||||
second = make_pdf(source, "한글.pdf")
|
||||
second = make_pdf(source, "korean.pdf")
|
||||
adapter = FixtureAdapter(raw_markdown="# Batch\n\nNo formulas.\n", raw_structured={"pages": 1})
|
||||
|
||||
exit_code = main(["convert", str(source), "--out", str(tmp_path / "out")], adapter=adapter, clock=fixed_clock)
|
||||
@@ -144,9 +153,8 @@ def test_v1_fast_cli_batch_summary_matches_generated_outputs(tmp_path: Path, cap
|
||||
assert "converted: 2" in captured.out
|
||||
assert "failed: 0" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert (tmp_path / "out" / "a.md").exists()
|
||||
assert (tmp_path / "out" / "a.metadata.json").exists()
|
||||
assert (tmp_path / "out" / "a.report.md").exists()
|
||||
assert (tmp_path / "out" / "한글.md").exists()
|
||||
assert (tmp_path / "out" / "한글.metadata.json").exists()
|
||||
assert (tmp_path / "out" / "한글.report.md").exists()
|
||||
assert (tmp_path / "out" / "a" / "a_001.md").exists()
|
||||
assert (tmp_path / "out" / "a" / "a_report.md").exists()
|
||||
assert (tmp_path / "out" / "korean" / "korean_001.md").exists()
|
||||
assert (tmp_path / "out" / "korean" / "korean_report.md").exists()
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
|
||||
+112
-16
@@ -2,6 +2,8 @@ from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from importlib.metadata import entry_points
|
||||
from pathlib import Path
|
||||
@@ -9,8 +11,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.doctor import DoctorCheck, DoctorReport
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
@@ -69,6 +73,24 @@ def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def write_legacy_metadata(markdown_path: Path, source_pdf: Path) -> Path:
    """Write a fixed legacy-style metadata JSON file beside *markdown_path*.

    The target path is *markdown_path* with its suffix replaced by
    ``.metadata.json``. The payload records the resolved path and SHA-256
    digest of *source_pdf* together with constant engine, page and summary
    values. Returns the path of the file that was written.
    """
    target = markdown_path.with_suffix(".metadata.json")
    resolved_source = str(source_pdf.resolve())
    digest = hashlib.sha256(source_pdf.read_bytes()).hexdigest()
    payload = {
        "source_pdf": resolved_source,
        "source_sha256": digest,
        "created_at": "2026-05-08T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
        "engine_options": {"strict_local": True},
        "pages": [{"page_index": 0, "blocks": []}],
        "assets": [],
        "warnings": [],
        "summary": {"pages_processed": 1, "warning_count": 0},
    }
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    # Keep the trailing newline so the file ends like a regular text file.
    target.write_text(f"{serialized}\n", encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def test_console_script_entry_point_is_reserved() -> None:
|
||||
scripts = {entry_point.name: entry_point for entry_point in entry_points(group="console_scripts")}
|
||||
|
||||
@@ -128,18 +150,91 @@ def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsy
|
||||
out = tmp_path / "out"
|
||||
adapter = FakeAdapter()
|
||||
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out), "--metadata"], adapter=adapter, clock=fixed_clock)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "converted: 1" in captured.out
|
||||
assert "failed: 0" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert (out / "paper.md").exists()
|
||||
assert (out / "paper.metadata.json").exists()
|
||||
assert (out / "paper.report.md").exists()
|
||||
assert (out / "paper" / "paper_001.md").exists()
|
||||
assert not list(out.rglob("*.metadata.json"))
|
||||
assert (out / "paper" / "paper_report.md").exists()
|
||||
assert adapter.calls == [pdf.resolve()]
|
||||
assert adapter.options[0].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
assert adapter.options[0].to_engine_options() == {
|
||||
"strict_local": True,
|
||||
"gpu_device": "cuda:0",
|
||||
"mineru_profile": {
|
||||
"requested": "auto",
|
||||
"applied": "safe",
|
||||
"environment": {
|
||||
"MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
|
||||
"MINERU_PDF_RENDER_THREADS": "1",
|
||||
"MINERU_PROCESSING_WINDOW_SIZE": "1",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_cli_convert_accepts_safe_mineru_profile(tmp_path: Path, capsys) -> None:
    """``--mineru-profile safe`` exits 0 and is recorded as the requested profile."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    fake = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "safe"]

    status = main(argv, adapter=fake, clock=fixed_clock)

    # The captured output is read but its content is not checked in this test.
    capsys.readouterr()
    assert status == 0
    engine_options = fake.options[0].to_engine_options()
    assert engine_options["mineru_profile"]["requested"] == "safe"
|
||||
|
||||
|
||||
def test_cli_convert_accepts_performance_mineru_profile(tmp_path: Path, capsys) -> None:
    """``--mineru-profile performance`` is accepted; the applied profile is still ``safe``."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    fake = FakeAdapter()
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "performance"]

    status = main(argv, adapter=fake, clock=fixed_clock)

    # The captured output is read but its content is not checked in this test.
    capsys.readouterr()
    assert status == 0
    recorded_profile = fake.options[0].to_engine_options()["mineru_profile"]
    assert recorded_profile["requested"] == "performance"
    assert recorded_profile["applied"] == "safe"
|
||||
|
||||
|
||||
def test_cli_convert_rejects_invalid_mineru_profile(tmp_path: Path, capsys) -> None:
    """An unknown ``--mineru-profile`` value exits with code 2 and an "invalid choice" error."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "fast"]

    with pytest.raises(SystemExit) as exc_info:
        main(argv)

    stderr_text = capsys.readouterr().err
    assert exc_info.value.code == 2
    assert "invalid choice" in stderr_text
|
||||
|
||||
|
||||
def test_cli_convert_gpu_auto_selects_largest_visible_gpu(tmp_path: Path, capsys, monkeypatch) -> None:
    """``--gpu auto`` picks the GPU with more memory from a stubbed two-GPU inventory."""
    source_pdf = make_pdf(tmp_path, "paper.pdf")
    fake = FakeAdapter()
    smaller_gpu = GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00")
    larger_gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00")
    visible_gpus = (smaller_gpu, larger_gpu)
    # Replace the GPU query used by the conversion module with the fixed inventory.
    monkeypatch.setattr(conversion_module, "query_nvidia_gpus", lambda: visible_gpus)
    argv = ["convert", str(source_pdf), "--out", str(tmp_path / "out"), "--gpu", "auto"]

    status = main(argv, adapter=fake, clock=fixed_clock)

    capsys.readouterr()
    engine_options = fake.options[0].to_engine_options()
    assert status == 0
    assert engine_options["gpu_device"] == "cuda:1"
    assert engine_options["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090"
|
||||
|
||||
|
||||
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
|
||||
@@ -173,7 +268,7 @@ def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> No
|
||||
assert exit_code == 0
|
||||
assert [path.name for path in adapter.calls] == ["child.pdf", "top.pdf"]
|
||||
assert "converted: 2" in captured.out
|
||||
assert (tmp_path / "out" / "nested" / "child.md").exists()
|
||||
assert (tmp_path / "out" / "nested" / "child" / "child_001.md").exists()
|
||||
|
||||
|
||||
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
|
||||
@@ -204,22 +299,23 @@ def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path:
|
||||
)
|
||||
capsys.readouterr()
|
||||
|
||||
markdown_path = out / "paper.md"
|
||||
markdown_path = out / "paper" / "paper_001.md"
|
||||
markdown_path.write_text("Inline $x_i$\n", encoding="utf-8")
|
||||
write_legacy_metadata(markdown_path, pdf)
|
||||
exit_code = main(["recheck", str(markdown_path)], clock=fixed_clock, math_checker=lambda _: True)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "rechecked:" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert "- Final status: `success`" in (out / "paper.report.md").read_text(encoding="utf-8")
|
||||
assert "- Final status: `success`" in markdown_path.with_suffix(".report.md").read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
|
||||
pdf = make_pdf(tmp_path, "paper.pdf")
|
||||
out = tmp_path / "out"
|
||||
out.mkdir()
|
||||
(out / "paper.md").write_text("old", encoding="utf-8")
|
||||
(out / "paper").mkdir(parents=True)
|
||||
(out / "paper" / "paper_001.md").write_text("old", encoding="utf-8")
|
||||
adapter = FakeAdapter()
|
||||
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)
|
||||
@@ -240,12 +336,12 @@ def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path,
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "converted: 2" in captured.out
|
||||
assert [path.name for path in adapter.calls] == [
|
||||
"long.part-001.pages-001-020.pdf",
|
||||
"long.part-002.pages-021-021.pdf",
|
||||
]
|
||||
assert (out / "long.part-001.pages-001-020.md").exists()
|
||||
assert (out / "long.part-002.pages-021-021.md").exists()
|
||||
assert len(adapter.calls) == 21
|
||||
assert [path.name for path in adapter.calls[:3]] == ["long.page-001.pdf", "long.page-002.pdf", "long.page-003.pdf"]
|
||||
assert (out / "long" / "long_001.md").exists()
|
||||
assert (out / "long" / "long_002.md").exists()
|
||||
assert (out / "long" / "long_report.md").exists()
|
||||
assert not list(out.rglob("*.metadata.json"))
|
||||
|
||||
|
||||
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
|
||||
|
||||
+357
-55
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.conversion import BatchConversionResult, convert_input, convert_pdf, recheck_markdown
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult, StrictLocalViolationError
|
||||
from pdf2md.paths import OutputConflictError
|
||||
@@ -32,6 +33,7 @@ class FakeAdapter:
|
||||
self.warnings = warnings
|
||||
self.asset_name = asset_name
|
||||
self.calls: list[tuple[Path, Path, object]] = []
|
||||
self.input_page_counts: list[int] = []
|
||||
|
||||
def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
|
||||
input_path = Path(input_pdf)
|
||||
@@ -39,6 +41,10 @@ class FakeAdapter:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
(output_dir / "raw.log").write_text("raw output", encoding="utf-8")
|
||||
self.calls.append((input_path, output_dir, options))
|
||||
try:
|
||||
self.input_page_counts.append(len(PdfReader(input_path).pages))
|
||||
except Exception:
|
||||
self.input_page_counts.append(0)
|
||||
asset_paths: tuple[Path, ...] = ()
|
||||
if self.asset_name is not None:
|
||||
asset_path = output_dir / "assets" / self.asset_name
|
||||
@@ -67,12 +73,17 @@ class SequencedAdapter:
|
||||
def __init__(self, outcomes: tuple[bool, ...]) -> None:
|
||||
self.outcomes = list(outcomes)
|
||||
self.calls: list[Path] = []
|
||||
self.input_page_counts: list[int] = []
|
||||
|
||||
def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
|
||||
input_path = Path(input_pdf)
|
||||
output_dir = Path(work_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.calls.append(input_path)
|
||||
try:
|
||||
self.input_page_counts.append(len(PdfReader(input_path).pages))
|
||||
except Exception:
|
||||
self.input_page_counts.append(0)
|
||||
succeeded = self.outcomes.pop(0)
|
||||
warning = WarningRecord(WarningCode.MINERU_CLI_FAILED, WarningSeverity.ERROR, "MinerU failed.")
|
||||
return MinerUAdapterResult(
|
||||
@@ -93,6 +104,66 @@ class SequencedAdapter:
|
||||
)
|
||||
|
||||
|
||||
class PageMarkdownAdapter:
    """Fake adapter that hands out one queued markdown string per ``convert`` call."""

    def __init__(self, markdown_pages: tuple[str, ...]) -> None:
        # Copy into a list so entries can be consumed from the front.
        self.markdown_pages = [*markdown_pages]
        self.calls: list[Path] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call, consume the next queued markdown and return a successful result."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)
        page_markdown = self.markdown_pages.pop(0)
        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=True,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=page_markdown,
            raw_structured={"pages": 1},
            asset_paths=(),
            warnings=(),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class CollidingPageAssetAdapter:
    """Fake adapter that writes an asset named ``fig.png`` on every ``convert`` call."""

    def __init__(self) -> None:
        self.calls: list[Path] = []

    def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
        """Record the call, write this call's ``fig.png`` and return a successful result."""
        source = Path(input_pdf)
        target_dir = Path(work_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        self.calls.append(source)
        # The 1-based call count is used as the page number in the asset bytes and markdown.
        page_number = len(self.calls)
        assets_dir = target_dir / "assets"
        assets_dir.mkdir(parents=True, exist_ok=True)
        asset_path = assets_dir / "fig.png"
        asset_path.write_bytes(f"asset {page_number}".encode("utf-8"))
        if options is None:
            engine_options = {"strict_local": True}
        else:
            engine_options = options.to_engine_options()
        return MinerUAdapterResult(
            succeeded=True,
            command=("mineru", "-p", str(source), "-o", str(target_dir)),
            input_pdf=source,
            work_dir=target_dir,
            raw_markdown=f"Page {page_number}\n\n\n",
            raw_structured={"pages": 1},
            asset_paths=(asset_path,),
            warnings=(),
            engine="MinerU",
            engine_version="3.1.0",
            engine_options=engine_options,
            exit_code=0,
            stdout="",
            stderr="",
        )
|
||||
|
||||
|
||||
class NestedMinerUAssetAdapter:
|
||||
def convert(self, input_pdf, work_dir, options=None) -> MinerUAdapterResult:
|
||||
input_path = Path(input_pdf)
|
||||
@@ -140,6 +211,20 @@ def make_pdf_with_pages(tmp_path: Path, page_count: int, name: str = "paper.pdf"
|
||||
return path
|
||||
|
||||
|
||||
def report_metadata(result) -> dict:
    """Return ``result._report_metadata``, asserting that it is not ``None``."""
    attached = result._report_metadata
    assert attached is not None
    return attached
|
||||
|
||||
|
||||
def write_legacy_metadata(result) -> Path:
    """Dump the result's report metadata to a ``.metadata.json`` file beside its markdown.

    The target is ``result.markdown_path`` with its suffix replaced by
    ``.metadata.json``. The JSON is written with sorted keys, two-space
    indentation, non-ASCII characters kept as-is, and a trailing newline.
    Returns the path that was written.
    """
    target = result.markdown_path.with_suffix(".metadata.json")
    serialized = json.dumps(
        report_metadata(result),
        indent=2,
        ensure_ascii=False,
        sort_keys=True,
    )
    target.write_text(f"{serialized}\n", encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(
|
||||
@@ -156,18 +241,23 @@ def test_convert_pdf_writes_markdown_metadata_report_and_assets(tmp_path: Path)
|
||||
assert result.warning_count == 0
|
||||
assert result.engine == "MinerU"
|
||||
assert result.engine_version == "3.1.0"
|
||||
assert result.markdown_path.read_text(encoding="utf-8") == "# Title\n\nInline $x_i$\n\n\n"
|
||||
assert (tmp_path / "out" / "paper.assets" / "fig.png").read_bytes() == b"asset"
|
||||
assert result.markdown_path == tmp_path / "out" / "paper" / "paper_001.md"
|
||||
assert result.markdown_path.read_text(encoding="utf-8") == "# Title\n\nInline $x_i$\n\n\n"
|
||||
assert (tmp_path / "out" / "paper" / "images" / "fig.png").read_bytes() == b"asset"
|
||||
assert result.metadata_path is None
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
assert result.report_path.exists()
|
||||
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest()
|
||||
assert metadata["created_at"] == "2026-05-08T00:00:00Z"
|
||||
assert metadata["summary"]["pages_processed"] == 2
|
||||
assert metadata["summary"]["inline_formula_count"] == 1
|
||||
assert metadata["summary"]["asset_count"] == 1
|
||||
assert metadata["assets"] == [{"relative_path": "paper.assets/fig.png"}]
|
||||
assert "- Final status: `success`" in result.report_path.read_text(encoding="utf-8")
|
||||
assert metadata["assets"] == [{"relative_path": "images/fig.png"}]
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
assert "- Final status: `success`" in report
|
||||
assert "Metadata JSON:" not in report
|
||||
assert not adapter.calls[0][1].exists()
|
||||
|
||||
|
||||
@@ -183,14 +273,16 @@ def test_convert_pdf_adapter_failure_returns_failed_result_without_fallback_or_o
|
||||
assert result.warnings == (warning,)
|
||||
assert len(adapter.calls) == 1
|
||||
assert not result.markdown_path.exists()
|
||||
assert not result.report_path.exists()
|
||||
assert result.metadata_path is None
|
||||
assert result.report_path.exists()
|
||||
assert "- Final status: `failed`" in result.report_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_convert_pdf_respects_output_conflicts_and_overwrite(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
out = tmp_path / "out"
|
||||
out.mkdir()
|
||||
(out / "paper.md").write_text("old", encoding="utf-8")
|
||||
(out / "paper").mkdir(parents=True)
|
||||
(out / "paper" / "paper_001.md").write_text("old", encoding="utf-8")
|
||||
|
||||
with pytest.raises(OutputConflictError):
|
||||
convert_pdf(pdf, out, adapter=FakeAdapter(), clock=fixed_clock)
|
||||
@@ -209,7 +301,7 @@ def test_convert_pdf_can_skip_metadata_json_but_still_writes_report(tmp_path: Pa
|
||||
assert result.metadata_path is None
|
||||
assert result.markdown_path.exists()
|
||||
assert result.report_path.exists()
|
||||
assert not (tmp_path / "out" / "paper.metadata.json").exists()
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
assert "Metadata JSON:" not in report
|
||||
assert "Report Markdown:" in report
|
||||
@@ -223,7 +315,7 @@ def test_convert_pdf_records_math_checker_failures_in_metadata_and_report(tmp_pa
|
||||
|
||||
assert result.final_status == "partial"
|
||||
assert [warning.code for warning in result.warnings] == [WarningCode.MATH_RENDER_FAILED]
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["summary"]["math_render_error_count"] == 1
|
||||
assert metadata["warnings"][0]["code"] == "MATH_RENDER_FAILED"
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
@@ -244,7 +336,7 @@ def test_convert_pdf_repairs_math_render_failure_before_writing_outputs(tmp_path
|
||||
assert result.final_status == "partial"
|
||||
assert result.markdown_path.read_text(encoding="utf-8") == "$$\nx ^ {i} {} ^ {t}\n$$"
|
||||
assert [warning.code for warning in result.warnings] == [WarningCode.MATH_RENDER_REPAIRED]
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
@@ -256,6 +348,7 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(bad_math\\)\n")
|
||||
result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: False, clock=fixed_clock)
|
||||
legacy_metadata_path = write_legacy_metadata(result)
|
||||
|
||||
result.markdown_path.write_text("Inline $x_i$\n", encoding="utf-8")
|
||||
rechecked = recheck_markdown(result.markdown_path, math_checker=lambda _: True, clock=fixed_clock)
|
||||
@@ -263,9 +356,9 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
|
||||
assert rechecked.final_status == "success"
|
||||
assert rechecked.warning_count == 0
|
||||
assert rechecked.markdown_path == result.markdown_path
|
||||
assert rechecked.metadata_path == result.metadata_path
|
||||
assert rechecked.report_path == result.report_path
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
assert rechecked.metadata_path == legacy_metadata_path
|
||||
assert rechecked.report_path == result.markdown_path.with_suffix(".report.md")
|
||||
metadata = json.loads(legacy_metadata_path.read_text(encoding="utf-8"))
|
||||
assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest()
|
||||
assert metadata["created_at"] == "2026-05-08T00:00:00Z"
|
||||
assert metadata["summary"]["pages_processed"] == 1
|
||||
@@ -273,7 +366,7 @@ def test_recheck_markdown_regenerates_metadata_and_report_from_current_markdown(
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
assert metadata["summary"]["warning_count"] == 0
|
||||
assert metadata["warnings"] == []
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
report = rechecked.report_path.read_text(encoding="utf-8")
|
||||
assert "- Final status: `success`" in report
|
||||
assert "- Math render error count: 0" in report
|
||||
assert "- None" in report
|
||||
@@ -287,17 +380,26 @@ def test_recheck_markdown_repairs_math_render_failure(tmp_path: Path) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="No formulas.\n")
|
||||
result = convert_pdf(pdf, tmp_path / "out", adapter=adapter, math_checker=lambda _: True, clock=fixed_clock)
|
||||
legacy_metadata_path = write_legacy_metadata(result)
|
||||
result.markdown_path.write_text("$$\nx ^ {i} ^ {t}\n$$\n", encoding="utf-8")
|
||||
|
||||
rechecked = recheck_markdown(result.markdown_path, math_checker=RepairAwareChecker(), clock=fixed_clock)
|
||||
|
||||
assert rechecked.markdown_path.read_text(encoding="utf-8") == "$$\nx ^ {i} {} ^ {t}\n$$\n"
|
||||
assert [warning.code for warning in rechecked.warnings] == [WarningCode.MATH_RENDER_REPAIRED]
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = json.loads(legacy_metadata_path.read_text(encoding="utf-8"))
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
assert metadata["warnings"][0]["code"] == "MATH_RENDER_REPAIRED"
|
||||
|
||||
|
||||
def test_recheck_markdown_requires_legacy_metadata_for_simplified_outputs(tmp_path: Path) -> None:
    """Rechecking a freshly converted markdown with no metadata JSON written raises ``ValueError``."""
    source_pdf = make_pdf(tmp_path)
    converted = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=FakeAdapter(),
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    expected_message = "Legacy adjacent metadata JSON"
    with pytest.raises(ValueError, match=expected_message):
        recheck_markdown(converted.markdown_path, math_checker=lambda _: True, clock=fixed_clock)
|
||||
|
||||
|
||||
def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path: Path, monkeypatch) -> None:
|
||||
pdf = make_pdf(tmp_path)
|
||||
adapter = FakeAdapter(raw_markdown="Inline \\(x\\)\n")
|
||||
@@ -308,7 +410,7 @@ def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path:
|
||||
assert result.final_status == "partial"
|
||||
assert result.warnings[0].code == WarningCode.MATH_RENDER_FAILED
|
||||
assert result.warnings[0].severity == WarningSeverity.INFO
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["summary"]["warning_count"] == 1
|
||||
assert metadata["summary"]["math_render_error_count"] == 0
|
||||
report = result.report_path.read_text(encoding="utf-8")
|
||||
@@ -316,6 +418,55 @@ def test_convert_pdf_records_unavailable_math_checker_for_math_output(tmp_path:
|
||||
assert "- Math render error count: 0" in report
|
||||
|
||||
|
||||
def test_convert_pdf_records_text_fidelity_without_replacing_markdown(tmp_path: Path, monkeypatch) -> None:
    """Text-fidelity warnings are recorded while the adapter's Markdown is written as-is."""
    adapter_markdown = "쉘의 력과 曲률\n"
    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter(raw_markdown=adapter_markdown, raw_structured={"pages": 1})
    # Swap pdf2md.text_fidelity.extract_pdf_text_pages for a stub that returns one fixed page string.
    monkeypatch.setattr(
        "pdf2md.text_fidelity.extract_pdf_text_pages",
        lambda _: ("쉘의 응력과 곡률\n",),
    )

    outcome = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=fake_adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # The written Markdown must equal what the adapter produced.
    assert outcome.markdown_path.read_text(encoding="utf-8") == adapter_markdown
    expected_codes = [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
    ]
    assert [entry.code for entry in outcome.warnings] == expected_codes

    details = report_metadata(outcome)
    assert details["text_fidelity"][0]["replacement_candidate"] is True
    assert details["summary"]["text_fidelity_low_page_count"] == 1
    assert details["summary"]["text_fidelity_unexpected_cjk_count"] == 1

    report_text = outcome.report_path.read_text(encoding="utf-8")
    for fragment in ("## Text Fidelity", "`TEXT_FIDELITY_LOW` page 0"):
        assert fragment in report_text
|
||||
|
||||
|
||||
def test_recheck_markdown_reruns_text_fidelity_without_duplicate_old_warnings(tmp_path: Path, monkeypatch) -> None:
    """After the Markdown is overwritten with the stubbed PDF text, a recheck reports only TEXT_LAYER_AVAILABLE."""
    reference_text = "쉘의 응력과 곡률\n"
    source_pdf = make_pdf(tmp_path)
    # Stub for pdf2md.text_fidelity.extract_pdf_text_pages: always one page of reference text.
    monkeypatch.setattr(
        "pdf2md.text_fidelity.extract_pdf_text_pages",
        lambda _: (reference_text,),
    )
    output_dir = tmp_path / "out"
    lossy_adapter = FakeAdapter(raw_markdown="쉘의 력과 曲률\n", raw_structured={"pages": 1})
    converted = convert_pdf(
        source_pdf,
        output_dir,
        adapter=lossy_adapter,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    # Overwrite the converted Markdown with the same text the stub returns.
    converted.markdown_path.write_text(reference_text, encoding="utf-8")
    legacy_path = write_legacy_metadata(converted)
    rechecked = recheck_markdown(converted.markdown_path, math_checker=lambda _: True, clock=fixed_clock)

    assert [entry.code for entry in rechecked.warnings] == [WarningCode.TEXT_LAYER_AVAILABLE]
    stored = json.loads(legacy_path.read_text(encoding="utf-8"))
    assert [entry["code"] for entry in stored["warnings"]] == ["TEXT_LAYER_AVAILABLE"]
    assert stored["summary"]["text_fidelity_low_page_count"] == 0
|
||||
|
||||
|
||||
def test_convert_pdf_uses_default_math_checker_when_available(tmp_path: Path, monkeypatch) -> None:
|
||||
class DefaultChecker:
|
||||
def __init__(self) -> None:
|
||||
@@ -342,7 +493,7 @@ def test_convert_pdf_keep_raw_preserves_adapter_work_directory(tmp_path: Path) -
|
||||
|
||||
result = convert_pdf(pdf, tmp_path / "out", keep_raw=True, adapter=FakeAdapter(), clock=fixed_clock)
|
||||
|
||||
assert result.raw_dir == tmp_path / "out" / "paper.raw"
|
||||
assert result.raw_dir == tmp_path / "out" / "paper" / "raw"
|
||||
assert (result.raw_dir / "raw.log").read_text(encoding="utf-8") == "raw output"
|
||||
|
||||
|
||||
@@ -359,7 +510,10 @@ def test_convert_pdf_passes_gpu_device_to_strict_local_options(tmp_path: Path) -
|
||||
|
||||
convert_pdf(pdf, tmp_path / "out", gpu="cuda:0", adapter=adapter, clock=fixed_clock)
|
||||
|
||||
assert adapter.calls[0][2].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
engine_options = adapter.calls[0][2].to_engine_options()
|
||||
assert engine_options["strict_local"] is True
|
||||
assert engine_options["gpu_device"] == "cuda:0"
|
||||
assert engine_options["mineru_profile"]["requested"] == "auto"
|
||||
|
||||
|
||||
def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None:
|
||||
@@ -368,7 +522,58 @@ def test_convert_pdf_defaults_to_cuda_zero(tmp_path: Path) -> None:
|
||||
|
||||
convert_pdf(pdf, tmp_path / "out", adapter=adapter, clock=fixed_clock)
|
||||
|
||||
assert adapter.calls[0][2].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
engine_options = adapter.calls[0][2].to_engine_options()
|
||||
assert engine_options["strict_local"] is True
|
||||
assert engine_options["gpu_device"] == "cuda:0"
|
||||
assert engine_options["mineru_profile"]["requested"] == "auto"
|
||||
|
||||
|
||||
def test_convert_pdf_gpu_auto_selects_largest_gpu_and_records_profile(tmp_path: Path) -> None:
    """With gpu="auto" and two GPUs in the inventory, the 24564 MiB card (index 1) is used and recorded."""
    source_pdf = make_pdf(tmp_path)
    recording_adapter = FakeAdapter()
    smaller_gpu = GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00")
    larger_gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00")

    outcome = convert_pdf(
        source_pdf,
        tmp_path / "out",
        gpu="auto",
        mineru_profile="auto",
        gpu_inventory=(smaller_gpu, larger_gpu),
        adapter=recording_adapter,
        clock=fixed_clock,
    )

    # Options handed to the adapter on its first call.
    passed_options = recording_adapter.calls[0][2].to_engine_options()
    assert passed_options["gpu_device"] == "cuda:1"
    assert passed_options["mineru_profile"]["applied"] == "auto"
    assert passed_options["mineru_profile"]["selected_gpu"]["index"] == 1

    # The same choice must be present in the report metadata.
    recorded_options = report_metadata(outcome)["engine_options"]
    assert recorded_options["gpu_device"] == "cuda:1"
    assert recorded_options["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090"
|
||||
|
||||
|
||||
def test_convert_pdf_performance_profile_warning_is_recorded(tmp_path: Path) -> None:
    """Requesting "performance" on a GTX 1070 Ti yields MINERU_PROFILE_ADJUSTED and an applied "safe" profile."""
    source_pdf = make_pdf(tmp_path)
    fake_adapter = FakeAdapter()
    only_gpu = GpuInfo(
        index=0,
        name="NVIDIA GeForce GTX 1070 Ti",
        memory_total_mib=8192,
        driver_version="577.00",
    )

    outcome = convert_pdf(
        source_pdf,
        tmp_path / "out",
        gpu="cuda:0",
        mineru_profile="performance",
        gpu_inventory=(only_gpu,),
        adapter=fake_adapter,
        clock=fixed_clock,
    )

    assert [entry.code for entry in outcome.warnings] == [WarningCode.MINERU_PROFILE_ADJUSTED]
    details = report_metadata(outcome)
    assert details["warnings"][0]["code"] == "MINERU_PROFILE_ADJUSTED"
    assert details["engine_options"]["mineru_profile"]["applied"] == "safe"
|
||||
|
||||
|
||||
def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_path: Path) -> None:
|
||||
@@ -385,11 +590,10 @@ def test_convert_pdf_rewrites_nested_mineru_image_links_and_page_indexes(tmp_pat
|
||||
assert result.final_status == "success"
|
||||
assert result.pages_processed == 13
|
||||
markdown = result.markdown_path.read_text(encoding="utf-8")
|
||||
assert "" in markdown
|
||||
assert "](images/fig.png)" not in markdown
|
||||
copied_asset = tmp_path / "out" / "paper.assets" / "paper" / "hybrid_auto" / "images" / "fig.png"
|
||||
assert "" in markdown
|
||||
copied_asset = tmp_path / "out" / "paper" / "images" / "fig.png"
|
||||
assert copied_asset.read_bytes() == b"nested asset"
|
||||
metadata = json.loads(result.metadata_path.read_text(encoding="utf-8"))
|
||||
metadata = report_metadata(result)
|
||||
assert metadata["summary"]["pages_processed"] == 13
|
||||
assert metadata["summary"]["warning_count"] == 0
|
||||
|
||||
@@ -406,12 +610,13 @@ def test_convert_input_batch_continues_after_per_file_failure(tmp_path: Path) ->
|
||||
assert [path.name for path in adapter.calls] == ["a.pdf", "b.pdf", "c.pdf"]
|
||||
assert batch.converted_count == 2
|
||||
assert batch.failed_count == 1
|
||||
assert (tmp_path / "out" / "a.md").exists()
|
||||
assert not (tmp_path / "out" / "b.md").exists()
|
||||
assert (tmp_path / "out" / "c.md").exists()
|
||||
assert (tmp_path / "out" / "a" / "a_001.md").exists()
|
||||
assert not (tmp_path / "out" / "b" / "b_001.md").exists()
|
||||
assert (tmp_path / "out" / "b" / "b_report.md").exists()
|
||||
assert (tmp_path / "out" / "c" / "c_001.md").exists()
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(tmp_path: Path) -> None:
|
||||
def test_convert_pdf_chunk_mode_converts_single_pages_and_returns_grouped_outputs(tmp_path: Path) -> None:
|
||||
pdf = make_pdf_with_pages(tmp_path, 41, "thesis.pdf")
|
||||
adapter = FakeAdapter(raw_structured={"pages": 1})
|
||||
|
||||
@@ -427,60 +632,157 @@ def test_convert_pdf_chunk_mode_returns_batch_and_deletes_temporary_chunk_pdfs(t
|
||||
assert isinstance(batch, BatchConversionResult)
|
||||
assert batch.converted_count == 3
|
||||
assert [result.markdown_path.name for result in batch.results] == [
|
||||
"thesis.part-001.pages-001-020.md",
|
||||
"thesis.part-002.pages-021-040.md",
|
||||
"thesis.part-003.pages-041-041.md",
|
||||
"thesis_001.md",
|
||||
"thesis_002.md",
|
||||
"thesis_003.md",
|
||||
]
|
||||
assert [path.name for path, _, _ in adapter.calls] == [
|
||||
"thesis.part-001.pages-001-020.pdf",
|
||||
"thesis.part-002.pages-021-040.pdf",
|
||||
"thesis.part-003.pages-041-041.pdf",
|
||||
assert len(adapter.calls) == 41
|
||||
assert adapter.input_page_counts == [1] * 41
|
||||
assert [path.name for path, _, _ in adapter.calls[:3]] == [
|
||||
"thesis.page-001.pdf",
|
||||
"thesis.page-002.pdf",
|
||||
"thesis.page-003.pdf",
|
||||
]
|
||||
assert all(result.source_pdf == pdf.resolve() for result in batch.results)
|
||||
assert all(not path.exists() for path, _, _ in adapter.calls)
|
||||
|
||||
metadata = json.loads((tmp_path / "out" / "thesis.part-002.pages-021-040.metadata.json").read_text(encoding="utf-8"))
|
||||
assert all(result.metadata_path is None for result in batch.results)
|
||||
assert not list((tmp_path / "out").rglob("*.metadata.json"))
|
||||
assert {result.report_path for result in batch.results} == {tmp_path / "out" / "thesis" / "thesis_report.md"}
|
||||
|
||||
metadata = report_metadata(batch.results[1])
|
||||
assert metadata["source_pdf"] == str(pdf.resolve())
|
||||
assert metadata["source_sha256"] == hashlib.sha256(pdf.read_bytes()).hexdigest()
|
||||
assert metadata["engine_options"]["chunk"] == {
|
||||
"chunk_index": 2,
|
||||
"chunk_page_count": 20,
|
||||
"chunk_pdf_name": "thesis.part-002.pages-021-040.pdf",
|
||||
"original_source_pdf": str(pdf.resolve()),
|
||||
"source_page_end": 40,
|
||||
"source_page_start": 21,
|
||||
"total_chunks": 3,
|
||||
}
|
||||
report = (tmp_path / "out" / "thesis.part-002.pages-021-040.report.md").read_text(encoding="utf-8")
|
||||
assert "- Chunk: 2/3, source pages: 21-40" in report
|
||||
assert metadata["engine_options"]["page_conversion"] == {
|
||||
"failed_source_pages": [],
|
||||
"mineru_input_page_count": 1,
|
||||
"mode": "single_page",
|
||||
"output_group_page_count": 20,
|
||||
}
|
||||
report = (tmp_path / "out" / "thesis" / "thesis_report.md").read_text(encoding="utf-8")
|
||||
assert "- Markdown part 2/3:" in report
|
||||
assert "source pages 21-40" in report
|
||||
assert "thesis_002.md" in report
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_keeps_short_pdf_as_single_batch_result(tmp_path: Path) -> None:
|
||||
def test_convert_pdf_chunk_mode_converts_short_pdf_as_single_page_inputs(tmp_path: Path) -> None:
|
||||
pdf = make_pdf_with_pages(tmp_path, 3, "short.pdf")
|
||||
adapter = FakeAdapter(raw_structured={"pages": 3})
|
||||
adapter = FakeAdapter(raw_structured={"pages": 1})
|
||||
|
||||
batch = convert_pdf(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock)
|
||||
|
||||
assert isinstance(batch, BatchConversionResult)
|
||||
assert batch.converted_count == 1
|
||||
assert batch.results[0].markdown_path.name == "short.md"
|
||||
assert adapter.calls[0][0] == pdf.resolve()
|
||||
assert adapter.calls[0][0].exists()
|
||||
assert batch.results[0].markdown_path.name == "short_001.md"
|
||||
assert [path.name for path, _, _ in adapter.calls] == [
|
||||
"short.page-001.pdf",
|
||||
"short.page-002.pdf",
|
||||
"short.page-003.pdf",
|
||||
]
|
||||
assert adapter.input_page_counts == [1, 1, 1]
|
||||
assert all(not path.exists() for path, _, _ in adapter.calls)
|
||||
metadata = report_metadata(batch.results[0])
|
||||
assert metadata["engine_options"]["chunk"]["chunk_page_count"] == 3
|
||||
assert metadata["engine_options"]["page_conversion"]["output_group_page_count"] == 20
|
||||
|
||||
|
||||
def test_convert_input_chunk_mode_continues_after_failed_chunk(tmp_path: Path) -> None:
|
||||
pdf = make_pdf_with_pages(tmp_path, 41, "paper.pdf")
|
||||
def test_convert_input_chunk_mode_continues_after_failed_page_inside_group(tmp_path: Path) -> None:
|
||||
pdf = make_pdf_with_pages(tmp_path, 3, "paper.pdf")
|
||||
adapter = SequencedAdapter((True, False, True))
|
||||
|
||||
batch = convert_input(pdf, tmp_path / "out", adapter=adapter, chunk_pages=20, clock=fixed_clock)
|
||||
|
||||
assert batch.converted_count == 2
|
||||
assert batch.failed_count == 1
|
||||
assert batch.converted_count == 1
|
||||
assert batch.failed_count == 0
|
||||
assert [path.name for path in adapter.calls] == [
|
||||
"paper.part-001.pages-001-020.pdf",
|
||||
"paper.part-002.pages-021-040.pdf",
|
||||
"paper.part-003.pages-041-041.pdf",
|
||||
"paper.page-001.pdf",
|
||||
"paper.page-002.pdf",
|
||||
"paper.page-003.pdf",
|
||||
]
|
||||
assert (tmp_path / "out" / "paper.part-001.pages-001-020.md").exists()
|
||||
assert not (tmp_path / "out" / "paper.part-002.pages-021-040.md").exists()
|
||||
assert (tmp_path / "out" / "paper.part-003.pages-041-041.md").exists()
|
||||
assert adapter.input_page_counts == [1, 1, 1]
|
||||
assert (tmp_path / "out" / "paper" / "paper_001.md").exists()
|
||||
markdown = (tmp_path / "out" / "paper" / "paper_001.md").read_text(encoding="utf-8")
|
||||
assert "<!-- source-page: 2 conversion failed; see report -->" in markdown
|
||||
metadata = report_metadata(batch.results[0])
|
||||
assert metadata["summary"]["pages_processed"] == 3
|
||||
assert metadata["warnings"][0]["code"] == "MINERU_CLI_FAILED"
|
||||
assert metadata["warnings"][0]["severity"] == "warning"
|
||||
assert metadata["warnings"][0]["page_index"] == 1
|
||||
assert metadata["engine_options"]["page_conversion"]["failed_source_pages"] == [2]
|
||||
assert "- Final status: `partial`" in (tmp_path / "out" / "paper" / "paper_report.md").read_text(
|
||||
encoding="utf-8"
|
||||
)
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_failed_group_writes_report_but_no_markdown(tmp_path: Path) -> None:
    """A fully failed two-page group produces a report, no Markdown file and no metadata JSON."""
    output_dir = tmp_path / "out"
    source_pdf = make_pdf_with_pages(tmp_path, 2, "paper.pdf")
    # presumably (False, False) makes both per-page conversions fail — confirm against SequencedAdapter
    failing_adapter = SequencedAdapter((False, False))

    batch = convert_pdf(source_pdf, output_dir, adapter=failing_adapter, chunk_pages=20, clock=fixed_clock)

    assert batch.converted_count == 0
    assert batch.failed_count == 1
    (only_result,) = batch.results
    assert only_result.final_status == "failed"
    assert not only_result.markdown_path.exists()
    assert only_result.metadata_path is None
    leftover_metadata = list(output_dir.rglob("*.metadata.json"))
    assert not leftover_metadata
    assert only_result.report_path.exists()

    warning_entries = report_metadata(only_result)["warnings"]
    assert [entry["page_index"] for entry in warning_entries] == [0, 1]
    assert {entry["severity"] for entry in warning_entries} == {"error"}
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_copies_page_assets_without_collisions(tmp_path: Path) -> None:
    """Assets of the two pages are stored as page-001_fig.png and page-002_fig.png with distinct contents."""
    source_pdf = make_pdf_with_pages(tmp_path, 2, "paper.pdf")
    colliding_adapter = CollidingPageAssetAdapter()

    batch = convert_pdf(
        source_pdf,
        tmp_path / "out",
        adapter=colliding_adapter,
        chunk_pages=20,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    (only_result,) = batch.results
    markdown_text = only_result.markdown_path.read_text(encoding="utf-8")
    # NOTE(review): both literals below are empty in this view, so these checks pass for any
    # string; presumably they named the rewritten image links — confirm against the repository.
    assert "" in markdown_text
    assert "" in markdown_text

    expected_assets = {
        "page-001_fig.png": b"asset 1",
        "page-002_fig.png": b"asset 2",
    }
    for file_name, payload in expected_assets.items():
        assert (only_result.assets_dir / file_name).read_bytes() == payload

    recorded_assets = report_metadata(only_result)["assets"]
    assert [entry["relative_path"] for entry in recorded_assets] == [
        "images/page-001_fig.png",
        "images/page-002_fig.png",
    ]
|
||||
|
||||
|
||||
def test_convert_pdf_chunk_mode_preserves_page_text_fidelity_numbers(tmp_path: Path, monkeypatch) -> None:
    """Text-fidelity records keep page indexes 0-2 / page numbers 1-3 and extraction runs once on the source PDF."""
    pdf_file = make_pdf_with_pages(tmp_path, 3, "korean.pdf")
    seen_paths: list[Path] = []

    def recording_extract(source_pdf: Path) -> tuple[str, ...]:
        # Remember every path the extractor is asked for, then return three fixed pages.
        seen_paths.append(source_pdf)
        return ("가나다", "라마바", "사아자")

    monkeypatch.setattr(conversion_module, "extract_pdf_text_pages", recording_extract)
    page_adapter = PageMarkdownAdapter(("가나다\n", "라마\n", "사아자\n"))

    batch = convert_pdf(
        pdf_file,
        tmp_path / "out",
        adapter=page_adapter,
        chunk_pages=20,
        math_checker=lambda _: True,
        clock=fixed_clock,
    )

    (only_result,) = batch.results
    details = report_metadata(only_result)
    fidelity_records = details["text_fidelity"]
    assert [record["page_index"] for record in fidelity_records] == [0, 1, 2]
    assert [record["source_page_number"] for record in fidelity_records] == [1, 2, 3]
    assert details["summary"]["text_fidelity_checked_page_count"] == 3
    assert seen_paths == [pdf_file.resolve()]
|
||||
|
||||
+21
-5
@@ -165,7 +165,7 @@ def test_doctor_warns_when_gpu_and_pytorch_are_missing(tmp_path: Path) -> None:
|
||||
|
||||
|
||||
def test_doctor_warns_for_gtx_1070_ti_pascal_risk(tmp_path: Path) -> None:
|
||||
report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n")
|
||||
report = make_report(tmp_path, gpu_stdout="0, NVIDIA GeForce GTX 1070 Ti, 8192, 551.86\n")
|
||||
|
||||
gpu_check = find_check(report, "gpu")
|
||||
assert report.status == "warn"
|
||||
@@ -181,7 +181,7 @@ def test_doctor_warns_for_pytorch_pre_turing_capability(tmp_path: Path) -> None:
|
||||
|
||||
report = make_report(
|
||||
tmp_path,
|
||||
gpu_stdout="NVIDIA RTX 4060, 8192 MiB, 551.86\n",
|
||||
gpu_stdout="0, NVIDIA RTX 4060, 8192, 551.86\n",
|
||||
import_module=fake_pascal_torch,
|
||||
)
|
||||
|
||||
@@ -220,7 +220,7 @@ def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
|
||||
def failing_runner(command: tuple[str, ...]) -> DoctorCommandResult:
|
||||
if command[-1] == "--health":
|
||||
return DoctorCommandResult(command, 1, stderr="Cannot find package 'mathjax'")
|
||||
return command_runner("NVIDIA RTX 4060, 8192 MiB, 551.86\n")(command)
|
||||
return command_runner("0, NVIDIA RTX 4060, 8192, 551.86\n")(command)
|
||||
|
||||
report = make_report(tmp_path, run_command=failing_runner)
|
||||
|
||||
@@ -232,7 +232,7 @@ def test_doctor_warns_when_mathjax_health_fails(tmp_path: Path) -> None:
|
||||
|
||||
|
||||
def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
|
||||
report = make_report(tmp_path, gpu_stdout="NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 551.86\n")
|
||||
report = make_report(tmp_path, gpu_stdout="0, NVIDIA GeForce GTX 1070 Ti, 8192, 551.86\n")
|
||||
|
||||
formatted = format_doctor_report(report)
|
||||
|
||||
@@ -241,13 +241,29 @@ def test_format_doctor_report_is_stable(tmp_path: Path) -> None:
|
||||
assert "[PASS] local-only:" in formatted
|
||||
|
||||
|
||||
def test_doctor_reports_auto_gpu_and_recommended_profile(tmp_path: Path) -> None:
    """With two GPUs listed, the gpu check passes and its details name cuda:1 and the "auto" profile."""
    smi_rows = (
        "0, NVIDIA RTX 4060, 8192, 577.00\n",
        "1, NVIDIA RTX 4090, 24564, 577.00\n",
    )
    report = make_report(tmp_path, gpu_stdout="".join(smi_rows))

    gpu_check = find_check(report, "gpu")
    assert gpu_check.status == "pass"
    # Each fragment has to appear in at least one detail line of the gpu check.
    expected_fragments = (
        "gpu 1: NVIDIA RTX 4090, 24564 MiB, driver 577.00",
        "auto gpu: cuda:1",
        "recommended MinerU profile: auto",
    )
    for fragment in expected_fragments:
        assert any(fragment in detail for detail in gpu_check.details)
|
||||
|
||||
|
||||
def make_report(
|
||||
tmp_path: Path,
|
||||
*,
|
||||
python_version: tuple[int, int, int] = (3, 12, 7),
|
||||
available_tools: dict[str, str] | None = None,
|
||||
mineru_result: MinerUVersionResult | None = None,
|
||||
gpu_stdout: str = "NVIDIA RTX 4060, 8192 MiB, 551.86\n",
|
||||
gpu_stdout: str = "0, NVIDIA RTX 4060, 8192, 551.86\n",
|
||||
env: dict[str, str] | None = None,
|
||||
existing_paths: set[Path] | None = None,
|
||||
import_module=None,
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.gpu import GpuInfo, parse_nvidia_smi_gpus, select_gpu
|
||||
|
||||
|
||||
def test_parse_nvidia_smi_output_with_one_rtx_gpu() -> None:
    """One CSV row parses into a one-element tuple of GpuInfo without pre-Turing risk."""
    parsed = parse_nvidia_smi_gpus("0, NVIDIA GeForce RTX 4090, 24564, 577.00\n")

    expected_gpu = GpuInfo(
        index=0,
        name="NVIDIA GeForce RTX 4090",
        memory_total_mib=24564,
        driver_version="577.00",
    )
    assert parsed == (expected_gpu,)
    assert parsed[0].pre_turing_risk is False
|
||||
|
||||
|
||||
def test_parse_nvidia_smi_output_with_multiple_gpus_and_mib_suffix() -> None:
    """Two rows whose memory field carries a " MiB" suffix parse to integer MiB values."""
    smi_rows = (
        "0, NVIDIA GeForce GTX 1070 Ti, 8192 MiB, 577.00\n",
        "1, NVIDIA RTX A5000, 24564 MiB, 577.00\n",
    )
    gpus = parse_nvidia_smi_gpus("".join(smi_rows))

    indexes = [gpu.index for gpu in gpus]
    assert indexes == [0, 1]
    memory_sizes = [gpu.memory_total_mib for gpu in gpus]
    assert memory_sizes == [8192, 24564]
    # Only the GTX 1070 Ti row is expected to carry the pre-Turing flag.
    assert gpus[0].pre_turing_risk is True
    assert gpus[1].pre_turing_risk is False
|
||||
|
||||
|
||||
def test_parse_nvidia_smi_output_ignores_blank_lines_and_rejects_malformed_memory() -> None:
    """A row with a non-numeric memory field raises ValueError matching "memory", even after a blank line."""
    malformed_output = "\n0, NVIDIA RTX 4090, not-memory, 577.00\n"
    with pytest.raises(ValueError, match="memory"):
        parse_nvidia_smi_gpus(malformed_output)
|
||||
|
||||
|
||||
def test_select_gpu_auto_chooses_largest_vram_gpu() -> None:
    """select_gpu(..., "auto") returns the GPU with 24564 MiB rather than the one with 8192 MiB."""
    smaller_gpu = GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00")
    larger_gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00")
    inventory = (smaller_gpu, larger_gpu)

    selection = select_gpu(inventory, "auto")

    assert selection.gpu == inventory[1]
    assert selection.cuda_device == "cuda:1"
|
||||
|
||||
|
||||
def test_select_gpu_accepts_cuda_and_numeric_requests() -> None:
    """Both the "cuda:1" and the bare "1" request forms resolve to the GPU at index 1."""
    inventory = (
        GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"),
        GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"),
    )

    by_cuda_name = select_gpu(inventory, "cuda:1")
    assert by_cuda_name.gpu == inventory[1]
    by_number = select_gpu(inventory, "1")
    assert by_number.cuda_device == "cuda:1"
|
||||
|
||||
|
||||
def test_select_gpu_errors_when_requested_gpu_is_absent() -> None:
    """Asking for cuda:1 when only index 0 is in the inventory raises ValueError matching "not visible"."""
    only_gpu = GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00")
    inventory = (only_gpu,)

    with pytest.raises(ValueError, match="not visible"):
        select_gpu(inventory, "cuda:1")
|
||||
|
||||
|
||||
def test_select_gpu_auto_errors_without_visible_gpus() -> None:
    """An empty inventory with an "auto" request raises ValueError matching "No visible NVIDIA GPU"."""
    empty_inventory = ()
    with pytest.raises(ValueError, match="No visible NVIDIA GPU"):
        select_gpu(empty_inventory, "auto")
|
||||
@@ -11,6 +11,7 @@ from pdf2md.ir import (
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
TextFidelityRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
@@ -171,3 +172,53 @@ def test_info_math_render_warning_is_not_counted_as_render_error(tmp_path: Path)
|
||||
|
||||
assert summary["warning_count"] == 1
|
||||
assert summary["math_render_error_count"] == 0
|
||||
|
||||
|
||||
def test_metadata_includes_text_fidelity_when_records_exist(tmp_path: Path) -> None:
    """build_metadata exposes the text-fidelity record and the matching summary counters."""
    fidelity_record = TextFidelityRecord(
        page_index=0,
        source_page_number=1,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=8,
        hangul_count_delta=-2,
        hangul_count_ratio=0.8,
        unexpected_cjk_count=1,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.72,
        replacement_candidate=True,
        comparison_status="checked",
    )
    low_fidelity_warning = WarningRecord(
        WarningCode.TEXT_FIDELITY_LOW,
        WarningSeverity.WARNING,
        "Low text fidelity.",
        page_index=0,
    )
    unexpected_cjk_warning = WarningRecord(
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
        WarningSeverity.WARNING,
        "Unexpected CJK.",
        page_index=0,
    )
    single_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(single_page,),
        text_fidelity=(fidelity_record,),
        warnings=(low_fidelity_warning, unexpected_cjk_warning),
    )

    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    assert metadata["text_fidelity"][0]["page_index"] == 0
    assert metadata["text_fidelity"][0]["replacement_candidate"] is True
    # Summary counters, checked in the same order as they are listed here.
    expected_counts = (
        ("text_fidelity_checked_page_count", 1),
        ("text_fidelity_low_page_count", 1),
        ("text_fidelity_unexpected_cjk_count", 1),
        ("text_fidelity_replacement_candidate_page_count", 1),
        ("text_fidelity_page_mapping_uncertain_count", 0),
    )
    for counter_name, expected_value in expected_counts:
        assert metadata["summary"][counter_name] == expected_value
|
||||
|
||||
@@ -1,17 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.mineru_adapter import (
|
||||
CommandResult,
|
||||
MinerUAdapter,
|
||||
MinerUOptions,
|
||||
StrictLocalViolationError,
|
||||
_run_command,
|
||||
)
|
||||
from pdf2md.mineru_profile import resolve_mineru_profile
|
||||
|
||||
|
||||
class FakeRunner:
|
||||
@@ -36,10 +40,16 @@ class EnvironmentRunner:
|
||||
def __init__(self) -> None:
|
||||
self.mineru_device_mode: str | None = None
|
||||
self.cuda_visible_devices: str | None = None
|
||||
self.processing_window_size: str | None = None
|
||||
self.max_concurrent_requests: str | None = None
|
||||
self.pdf_render_threads: str | None = None
|
||||
|
||||
def __call__(self, command: tuple[str, ...]) -> CommandResult:
|
||||
self.mineru_device_mode = os.environ.get("MINERU_DEVICE_MODE")
|
||||
self.cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
|
||||
self.processing_window_size = os.environ.get("MINERU_PROCESSING_WINDOW_SIZE")
|
||||
self.max_concurrent_requests = os.environ.get("MINERU_API_MAX_CONCURRENT_REQUESTS")
|
||||
self.pdf_render_threads = os.environ.get("MINERU_PDF_RENDER_THREADS")
|
||||
work_dir = Path(command[command.index("-o") + 1])
|
||||
work_dir.mkdir(parents=True, exist_ok=True)
|
||||
(work_dir / "paper.md").write_text("# Title\n", encoding="utf-8")
|
||||
@@ -133,6 +143,20 @@ def test_version_empty_output_is_explicit() -> None:
|
||||
assert [warning.code for warning in result.warnings] == [WarningCode.MINERU_CLI_FAILED]
|
||||
|
||||
|
||||
def test_default_runner_decodes_utf8_process_output() -> None:
    """_run_command returns UTF-8 bytes written by a child process as matching stdout/stderr text."""
    # One-line child program that writes raw UTF-8 bytes to both standard streams.
    child_program = "".join(
        (
            "import sys; ",
            "sys.stdout.buffer.write('stdout ∙\\n'.encode('utf-8')); ",
            "sys.stderr.buffer.write('stderr ∙\\n'.encode('utf-8'))",
        )
    )
    command = (sys.executable, "-c", child_program)

    outcome = _run_command(command)

    assert outcome.exit_code == 0
    assert outcome.stdout == "stdout ∙\n"
    assert outcome.stderr == "stderr ∙\n"
|
||||
|
||||
|
||||
def test_build_command_is_list_based_and_deterministic(tmp_path: Path) -> None:
|
||||
adapter = MinerUAdapter(which=available, runner=FakeRunner())
|
||||
input_pdf = tmp_path / "논문 with spaces.pdf"
|
||||
@@ -200,7 +224,15 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path
|
||||
]
|
||||
assert result.engine == "MinerU"
|
||||
assert result.engine_version == "3.1.0"
|
||||
assert result.engine_options == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
assert result.engine_options == {
|
||||
"strict_local": True,
|
||||
"gpu_device": "cuda:0",
|
||||
"mineru_profile": {
|
||||
"requested": "auto",
|
||||
"applied": "auto",
|
||||
"environment": {},
|
||||
},
|
||||
}
|
||||
assert result.exit_code == 0
|
||||
assert result.stdout == "ok"
|
||||
assert result.stderr == "warn"
|
||||
@@ -209,6 +241,7 @@ def test_successful_mocked_output_parses_markdown_json_and_assets(tmp_path: Path
|
||||
def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_path: Path, monkeypatch) -> None:
|
||||
monkeypatch.setenv("MINERU_DEVICE_MODE", "cpu")
|
||||
monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "7")
|
||||
monkeypatch.setenv("MINERU_PROCESSING_WINDOW_SIZE", "99")
|
||||
runner = EnvironmentRunner()
|
||||
adapter = MinerUAdapter(which=available, runner=runner)
|
||||
|
||||
@@ -219,6 +252,53 @@ def test_gpu_option_sets_mineru_environment_and_restores_previous_values(tmp_pat
|
||||
assert runner.cuda_visible_devices == "0"
|
||||
assert os.environ["MINERU_DEVICE_MODE"] == "cpu"
|
||||
assert os.environ["CUDA_VISIBLE_DEVICES"] == "7"
|
||||
assert os.environ["MINERU_PROCESSING_WINDOW_SIZE"] == "99"
|
||||
|
||||
|
||||
def test_profile_option_sets_allowlisted_mineru_environment_and_engine_options(tmp_path: Path) -> None:
    """A resolved "performance" profile reaches the runner's environment and the result's engine options."""
    strong_gpu = GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00")
    resolved = resolve_mineru_profile("performance", selected_gpu=strong_gpu, cuda_requested=True)
    env_runner = EnvironmentRunner()
    adapter = MinerUAdapter(which=available, runner=env_runner)
    options = MinerUOptions(
        gpu_device="cuda:1",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
    )

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    assert outcome.succeeded is True
    # Values the runner recorded when the adapter invoked it.
    assert env_runner.mineru_device_mode == "cuda"
    assert env_runner.cuda_visible_devices == "1"
    assert env_runner.processing_window_size == "16"
    assert env_runner.max_concurrent_requests == "1"
    assert env_runner.pdf_render_threads == "4"
    assert outcome.engine_options["mineru_profile"]["applied"] == "performance"
|
||||
|
||||
|
||||
def test_profile_warnings_are_preserved_in_adapter_result(tmp_path: Path) -> None:
    """Warnings passed via MinerUOptions.profile_warnings appear on the adapter result."""
    pascal_gpu = GpuInfo(
        index=0,
        name="NVIDIA GeForce GTX 1070 Ti",
        memory_total_mib=8192,
        driver_version="577.00",
    )
    resolved = resolve_mineru_profile("performance", selected_gpu=pascal_gpu, cuda_requested=True)
    adapter = MinerUAdapter(which=available, runner=EnvironmentRunner())
    options = MinerUOptions(
        gpu_device="cuda:0",
        mineru_profile="performance",
        profile_environment=resolved.environment,
        profile_engine_options=resolved.to_engine_options(),
        profile_warnings=resolved.warnings,
    )

    outcome = adapter.convert(tmp_path / "paper.pdf", tmp_path / "work", options)

    warning_codes = [entry.code for entry in outcome.warnings]
    assert warning_codes == [WarningCode.MINERU_PROFILE_ADJUSTED]
|
||||
|
||||
|
||||
def test_nonzero_exit_does_not_parse_existing_outputs_or_fallback(tmp_path: Path) -> None:
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.ir import WarningCode, WarningSeverity
|
||||
from pdf2md.mineru_profile import resolve_mineru_profile
|
||||
|
||||
|
||||
# Environment expected for the "safe" profile: every tuning variable is set to "1".
SAFE_ENV = dict.fromkeys(
    (
        "MINERU_PROCESSING_WINDOW_SIZE",
        "MINERU_API_MAX_CONCURRENT_REQUESTS",
        "MINERU_PDF_RENDER_THREADS",
    ),
    "1",
)
|
||||
|
||||
|
||||
def test_auto_profile_uses_safe_values_without_gpu_inventory() -> None:
    """With no selected GPU, "auto" resolves to the safe profile and carries a GPU_UNAVAILABLE warning."""
    resolved = resolve_mineru_profile("auto", selected_gpu=None, cuda_requested=True)

    assert resolved.applied_profile == "safe"
    assert resolved.environment == SAFE_ENV
    warning_codes = [entry.code for entry in resolved.warnings]
    assert warning_codes == [WarningCode.GPU_UNAVAILABLE]
|
||||
|
||||
|
||||
def test_auto_profile_uses_safe_values_for_gtx_1070_ti() -> None:
    """An 8 GiB GTX 1070 Ti requested as "auto" is resolved to the "safe" profile."""
    gpu_name = "NVIDIA GeForce GTX 1070 Ti"
    gtx_1070_ti = GpuInfo(index=0, name=gpu_name, memory_total_mib=8192, driver_version="577.00")

    resolved = resolve_mineru_profile("auto", selected_gpu=gtx_1070_ti, cuda_requested=True)

    # The requested name is kept alongside the profile that was actually applied.
    assert resolved.requested_profile == "auto"
    assert resolved.applied_profile == "safe"
    assert resolved.environment == SAFE_ENV
    assert resolved.selected_gpu_name == gpu_name
|
||||
|
||||
|
||||
def test_auto_profile_uses_moderate_values_for_16gb_turing_or_newer_gpu() -> None:
    """A 16384 MiB RTX A4000 keeps the "auto" profile with moderate values and no warnings."""
    expected_environment = {
        "MINERU_PROCESSING_WINDOW_SIZE": "8",
        "MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
        "MINERU_PDF_RENDER_THREADS": "4",
    }
    a4000 = GpuInfo(
        index=0,
        name="NVIDIA RTX A4000",
        memory_total_mib=16384,
        driver_version="577.00",
    )

    resolved = resolve_mineru_profile("auto", selected_gpu=a4000, cuda_requested=True)

    assert resolved.applied_profile == "auto"
    assert resolved.environment == expected_environment
    assert resolved.warnings == ()
|
||||
|
||||
|
||||
def test_auto_profile_uses_conservative_values_for_12gb_to_16gb_gpu() -> None:
    """A 12288 MiB RTX 4070 requested as "auto" gets the "auto-conservative" profile."""
    expected_environment = {
        "MINERU_PROCESSING_WINDOW_SIZE": "4",
        "MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
        "MINERU_PDF_RENDER_THREADS": "2",
    }
    rtx_4070 = GpuInfo(
        index=0,
        name="NVIDIA RTX 4070",
        memory_total_mib=12288,
        driver_version="577.00",
    )

    resolved = resolve_mineru_profile("auto", selected_gpu=rtx_4070, cuda_requested=True)

    assert resolved.applied_profile == "auto-conservative"
    assert resolved.environment == expected_environment
|
||||
|
||||
|
||||
def test_performance_profile_uses_performance_values_only_on_strong_gpu() -> None:
    """A 24564 MiB RTX 4090 at index 1 is granted the requested "performance" profile."""
    expected_environment = {
        "MINERU_PROCESSING_WINDOW_SIZE": "16",
        "MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
        "MINERU_PDF_RENDER_THREADS": "4",
    }
    rtx_4090 = GpuInfo(
        index=1,
        name="NVIDIA RTX 4090",
        memory_total_mib=24564,
        driver_version="577.00",
    )

    resolved = resolve_mineru_profile("performance", selected_gpu=rtx_4090, cuda_requested=True)

    assert resolved.applied_profile == "performance"
    assert resolved.environment == expected_environment
    # The selected GPU's index and VRAM are echoed on the resolved profile.
    assert resolved.selected_gpu_index == 1
    assert resolved.selected_gpu_vram_mib == 24564
|
||||
|
||||
|
||||
def test_performance_profile_downgrades_to_safe_on_weak_gpu() -> None:
    """"performance" on an 8 GiB GTX 1070 Ti falls back to "safe" with one WARNING-level notice."""
    gtx_1070_ti = GpuInfo(
        index=0,
        name="NVIDIA GeForce GTX 1070 Ti",
        memory_total_mib=8192,
        driver_version="577.00",
    )

    resolved = resolve_mineru_profile("performance", selected_gpu=gtx_1070_ti, cuda_requested=True)

    warning_codes = [item.code for item in resolved.warnings]

    assert resolved.applied_profile == "safe"
    assert resolved.environment == SAFE_ENV
    assert warning_codes == [WarningCode.MINERU_PROFILE_ADJUSTED]
    assert resolved.warnings[0].severity == WarningSeverity.WARNING
|
||||
|
||||
|
||||
def test_profile_details_are_json_ready() -> None:
    """``to_engine_options()`` returns a plain nested dict describing the resolved profile."""
    expected_environment = {
        "MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
        "MINERU_PDF_RENDER_THREADS": "4",
        "MINERU_PROCESSING_WINDOW_SIZE": "8",
    }
    expected_gpu = {
        "index": 0,
        "name": "NVIDIA RTX A5000",
        "memory_total_mib": 24564,
        "pre_turing_risk": False,
    }
    a5000 = GpuInfo(
        index=0,
        name="NVIDIA RTX A5000",
        memory_total_mib=24564,
        driver_version="577.00",
    )

    resolved = resolve_mineru_profile("auto", selected_gpu=a5000, cuda_requested=True)

    assert resolved.to_engine_options() == {
        "requested": "auto",
        "applied": "auto",
        "environment": expected_environment,
        "selected_gpu": expected_gpu,
    }
|
||||
+18
-16
@@ -73,14 +73,14 @@ def test_discovers_directory_recursive_with_relative_parents(tmp_path: Path) ->
|
||||
|
||||
|
||||
def test_discovery_order_is_deterministic_for_non_ascii_names(tmp_path: Path) -> None:
|
||||
touch(tmp_path / "한글.pdf")
|
||||
korean_pdf = touch(tmp_path / "논문.pdf")
|
||||
touch(tmp_path / "Alpha.pdf")
|
||||
touch(tmp_path / "beta.PDF")
|
||||
|
||||
first = discover_pdfs(tmp_path)
|
||||
second = discover_pdfs(tmp_path)
|
||||
|
||||
assert [item.source_path.name for item in first] == ["Alpha.pdf", "beta.PDF", "한글.pdf"]
|
||||
assert [item.source_path.name for item in first] == ["Alpha.pdf", "beta.PDF", korean_pdf.name]
|
||||
assert first == second
|
||||
|
||||
|
||||
@@ -91,22 +91,24 @@ def test_plans_all_default_output_paths_for_single_pdf(tmp_path: Path) -> None:
|
||||
[plan] = plan_pdf_outputs(pdf, output_root)
|
||||
|
||||
assert plan.source_pdf == pdf.resolve()
|
||||
assert plan.markdown_path == output_root.resolve() / "입력.md"
|
||||
assert plan.assets_dir == output_root.resolve() / "입력.assets"
|
||||
assert plan.metadata_path == output_root.resolve() / "입력.metadata.json"
|
||||
assert plan.report_path == output_root.resolve() / "입력.report.md"
|
||||
assert plan.markdown_path == output_root.resolve() / "입력" / "입력_001.md"
|
||||
assert plan.assets_dir == output_root.resolve() / "입력" / "images"
|
||||
assert plan.metadata_path is None
|
||||
assert plan.report_path == output_root.resolve() / "입력" / "입력_report.md"
|
||||
assert plan.raw_dir is None
|
||||
|
||||
|
||||
def test_plans_optional_metadata_and_raw_outputs(tmp_path: Path) -> None:
|
||||
def test_plans_metadata_flag_as_noop_and_raw_outputs(tmp_path: Path) -> None:
|
||||
pdf = touch(tmp_path / "paper.pdf")
|
||||
|
||||
[with_metadata_flag] = plan_pdf_outputs(pdf, tmp_path / "out", metadata=True)
|
||||
[without_metadata] = plan_pdf_outputs(pdf, tmp_path / "out", metadata=False)
|
||||
[with_raw] = plan_pdf_outputs(pdf, tmp_path / "out", keep_raw=True)
|
||||
|
||||
assert with_metadata_flag.metadata_path is None
|
||||
assert without_metadata.metadata_path is None
|
||||
assert without_metadata.report_path == (tmp_path / "out").resolve() / "paper.report.md"
|
||||
assert with_raw.raw_dir == (tmp_path / "out").resolve() / "paper.raw"
|
||||
assert without_metadata.report_path == (tmp_path / "out").resolve() / "paper" / "paper_report.md"
|
||||
assert with_raw.raw_dir == (tmp_path / "out").resolve() / "paper" / "raw"
|
||||
|
||||
|
||||
def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) -> None:
|
||||
@@ -117,8 +119,8 @@ def test_recursive_planning_preserves_relative_subdirectories(tmp_path: Path) ->
|
||||
plans = plan_pdf_outputs(root, tmp_path / "out", recursive=True)
|
||||
|
||||
assert [plan.markdown_path.relative_to((tmp_path / "out").resolve()) for plan in plans] == [
|
||||
Path("nested") / "same.md",
|
||||
Path("same.md"),
|
||||
Path("nested") / "same" / "same_001.md",
|
||||
Path("same") / "same_001.md",
|
||||
]
|
||||
|
||||
|
||||
@@ -137,21 +139,21 @@ def test_non_recursive_duplicate_output_paths_fail(tmp_path: Path) -> None:
|
||||
def test_output_conflicts_report_all_existing_paths(tmp_path: Path) -> None:
|
||||
pdf = touch(tmp_path / "paper.pdf")
|
||||
output_root = tmp_path / "out"
|
||||
(output_root / "paper.assets").mkdir(parents=True)
|
||||
(output_root / "paper.md").mkdir()
|
||||
touch(output_root / "paper.metadata.json")
|
||||
(output_root / "paper" / "images").mkdir(parents=True)
|
||||
(output_root / "paper" / "paper_001.md").mkdir()
|
||||
touch(output_root / "paper" / "paper_report.md")
|
||||
|
||||
with pytest.raises(OutputConflictError) as error:
|
||||
plan_pdf_outputs(pdf, output_root)
|
||||
|
||||
conflict_names = {path.name for path in error.value.conflicts}
|
||||
assert conflict_names == {"paper.assets", "paper.md", "paper.metadata.json"}
|
||||
assert conflict_names == {"images", "paper_001.md", "paper_report.md"}
|
||||
|
||||
|
||||
def test_overwrite_allows_existing_paths_without_deleting(tmp_path: Path) -> None:
|
||||
pdf = touch(tmp_path / "paper.pdf")
|
||||
output_root = tmp_path / "out"
|
||||
existing = touch(output_root / "paper.md")
|
||||
existing = touch(output_root / "paper" / "paper_001.md")
|
||||
|
||||
[plan] = plan_pdf_outputs(pdf, output_root, overwrite=True)
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ from pdf2md.ir import (
|
||||
BlockType,
|
||||
DocumentRecord,
|
||||
PageRecord,
|
||||
TextFidelityRecord,
|
||||
WarningCode,
|
||||
WarningRecord,
|
||||
WarningSeverity,
|
||||
@@ -161,3 +162,115 @@ def test_report_includes_chunk_context_when_metadata_has_chunk_options(tmp_path:
|
||||
report = render_report(metadata)
|
||||
|
||||
assert "- Chunk: 2/3, source pages: 21-40" in report
|
||||
|
||||
|
||||
def test_report_includes_single_page_conversion_context(tmp_path: Path) -> None:
    """The report gains a page-conversion line when ``engine_options`` has ``page_conversion``."""
    page_conversion = {
        "mode": "single_page",
        "mineru_input_page_count": 1,
        "output_group_page_count": 20,
        "failed_source_pages": [],
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {"strict_local": True, "page_conversion": page_conversion}

    rendered = render_report(metadata)

    expected_line = "- Page conversion mode: single-page MinerU inputs, grouped output size: 20"
    assert expected_line in rendered
|
||||
|
||||
|
||||
def test_report_includes_aggregate_output_folder_and_markdown_parts(tmp_path: Path) -> None:
    """The report lists the output folder and one line per Markdown part.

    Part 1 succeeded and has a Markdown path; part 2 failed, has no Markdown
    path, and carries two failed source pages that must be listed as well.
    """
    output_folder = tmp_path / "out" / "paper"
    succeeded_part = {
        "index": 1,
        "total": 2,
        "source_page_start": 1,
        "source_page_end": 20,
        "markdown_path": str(output_folder / "paper_001.md"),
        "status": "success",
        "warning_count": 0,
    }
    failed_part = {
        "index": 2,
        "total": 2,
        "source_page_start": 21,
        "source_page_end": 23,
        "markdown_path": None,
        "status": "failed",
        "warning_count": 2,
        "failed_source_pages": [22, 23],
    }
    metadata = make_metadata(tmp_path)
    metadata["engine_options"] = {
        "strict_local": True,
        "output_folder": str(output_folder),
        "parts": [succeeded_part, failed_part],
    }

    rendered = render_report(metadata)

    expected_fragments = (
        f"- Output folder: {output_folder}",
        "paper_001.md (source pages 1-20, status success)",
        "- Markdown part 2/2: unavailable (source pages 21-23, status failed)",
        "- Failed source pages for part 2: 22, 23",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
||||
|
||||
|
||||
def test_report_includes_text_fidelity_section_when_metadata_has_diagnostics(tmp_path: Path) -> None:
    """A document with text-fidelity records produces a "Text Fidelity" report section.

    Two records are supplied: one with status "checked" (similarity 0.61, two
    unexpected CJK characters, replacement candidate) and one with status
    "page_mapping_uncertain". The expected summary lines are asserted below.
    """
    checked_page = TextFidelityRecord(
        page_index=0,
        source_page_number=3,
        pypdf_text_available=True,
        markdown_text_available=True,
        pypdf_hangul_count=10,
        markdown_hangul_count=7,
        hangul_count_delta=-3,
        hangul_count_ratio=0.7,
        unexpected_cjk_count=2,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.61,
        replacement_candidate=True,
        comparison_status="checked",
    )
    uncertain_page = TextFidelityRecord(
        page_index=1,
        source_page_number=4,
        pypdf_text_available=True,
        markdown_text_available=False,
        pypdf_hangul_count=5,
        markdown_hangul_count=0,
        hangul_count_delta=-5,
        hangul_count_ratio=0.0,
        unexpected_cjk_count=0,
        pypdf_hangul_spacing_anomaly_ratio=0.0,
        markdown_hangul_spacing_anomaly_ratio=0.0,
        text_similarity=0.0,
        replacement_candidate=False,
        comparison_status="page_mapping_uncertain",
    )
    only_page = PageRecord(page_index=0, blocks=(BlockRecord(BlockType.PARAGRAPH),))
    document = DocumentRecord(
        source_pdf=tmp_path / "paper.pdf",
        pages=(only_page,),
        text_fidelity=(checked_page, uncertain_page),
    )
    metadata = build_metadata(
        document=document,
        source_sha256="0" * 64,
        created_at="2026-05-11T00:00:00Z",
        engine="MinerU",
        engine_version="3.1.0",
    )

    rendered = render_report(metadata)

    expected_lines = (
        "## Text Fidelity",
        "- Checked page count: 1",
        "- Low-fidelity page count: 1",
        "- Unexpected CJK count: 2",
        "- Replacement candidate page count: 1",
        "- Low-similarity pages: 0",
        "- Unexpected-CJK pages: 0",
        "- Uncertain page-mapping pages: 1",
    )
    for line in expected_lines:
        assert line in rendered
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pypdf import PdfWriter
|
||||
|
||||
from pdf2md.ir import WarningCode
|
||||
from pdf2md.text_fidelity import (
|
||||
check_text_fidelity,
|
||||
compare_text_pages,
|
||||
count_hangul_syllables,
|
||||
count_unexpected_cjk,
|
||||
hangul_spacing_anomaly_ratio,
|
||||
strip_markdown_for_text_fidelity,
|
||||
)
|
||||
|
||||
|
||||
def test_text_metric_helpers_count_hangul_cjk_and_spacing() -> None:
    """Check the three text-metric helpers on small Hangul/CJK samples."""
    mixed_sample = "응 력 A 曲"

    # Two Hangul syllables and one CJK ideograph in the mixed sample.
    assert count_hangul_syllables(mixed_sample) == 2
    assert count_unexpected_cjk(mixed_sample) == 1

    # A space between the syllables yields 1.0; the joined form yields 0.0.
    spaced_ratio = hangul_spacing_anomaly_ratio("응 력")
    joined_ratio = hangul_spacing_anomaly_ratio("응력")
    assert spaced_ratio == 1.0
    assert joined_ratio == 0.0
|
||||
|
||||
|
||||
def test_markdown_stripping_ignores_math_assets_and_code() -> None:
    """Stripping keeps prose but drops inline math, image assets, and code.

    The input holds a heading, an image reference, a sentence with inline
    math, a fenced code block, and a line mixing inline code with prose.
    """
    markdown = "\n".join(
        [
            "# 제목",
            "",
            # Without an image line the '"figure" not in stripped' assertion
            # below would pass vacuously.
            # NOTE(review): assumes the stripper removes the whole image
            # syntax (alt text and target) - confirm against the implementation.
            "![figure](images/figure.png)",
            "본문 $x_i$ 유지",
            "```",
            "코드 한글",
            "```",
            "`인라인 코드` 마지막",
        ]
    )

    stripped = strip_markdown_for_text_fidelity(markdown)

    # Prose from the heading, the body sentence, and the trailing line survives.
    assert "제목" in stripped
    assert "본문" in stripped
    assert "마지막" in stripped
    # Image reference, inline math, fenced code, and inline code are removed.
    assert "figure" not in stripped
    assert "x_i" not in stripped
    assert "코드 한글" not in stripped
    assert "인라인 코드" not in stripped
|
||||
|
||||
|
||||
def test_compare_text_pages_flags_low_hangul_fidelity_and_replacement_candidate() -> None:
    """A page that loses a Hangul syllable and gains a CJK character is flagged.

    The Markdown text drops one syllable and contains "曲" where the source
    has Hangul; the page must be a replacement candidate and three warnings
    must be emitted in the asserted order.
    """
    source_text = "쉘의 응력과 곡률을 계산한다"
    markdown_text = "쉘의 력과 曲률을 계산한다"

    comparison = compare_text_pages(
        source_pages=(source_text,),
        markdown_pages=(markdown_text,),
        source_page_start=6,
    )

    first_page = comparison.pages[0]
    warning_codes = [item.code for item in comparison.warnings]

    assert first_page.comparison_status == "checked"
    # The page number matches the source_page_start argument.
    assert first_page.source_page_number == 6
    assert first_page.pypdf_hangul_count > first_page.markdown_hangul_count
    assert first_page.unexpected_cjk_count == 1
    assert first_page.replacement_candidate is True
    assert warning_codes == [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_FIDELITY_LOW,
        WarningCode.UNEXPECTED_CJK_IN_KOREAN_TEXT,
    ]
|
||||
|
||||
|
||||
def test_compare_text_pages_allows_markdown_hangul_count_above_source() -> None:
    """Markdown with more Hangul than the source (5 vs 2 syllables) is not a replacement candidate."""
    comparison = compare_text_pages(source_pages=("응력",), markdown_pages=("응력 변형률",))

    only_page = comparison.pages[0]

    # 5 Markdown syllables over 2 source syllables.
    assert only_page.hangul_count_ratio == 2.5
    assert only_page.replacement_candidate is False
|
||||
|
||||
|
||||
def test_check_text_fidelity_marks_uncertain_page_mapping_for_multi_page_markdown() -> None:
    """One Markdown string checked against two source pages marks both pages as uncertain."""
    source_pages = ("첫 페이지", "둘째 페이지")
    combined_markdown = "첫 페이지와 둘째 페이지를 합친 Markdown"

    outcome = check_text_fidelity(
        Path("paper.pdf"),
        combined_markdown,
        page_count=2,
        source_text_pages=source_pages,
    )

    statuses = [entry.comparison_status for entry in outcome.pages]
    warning_codes = [item.code for item in outcome.warnings]

    assert statuses == ["page_mapping_uncertain", "page_mapping_uncertain"]
    assert warning_codes == [
        WarningCode.TEXT_LAYER_AVAILABLE,
        WarningCode.TEXT_PAGE_MAPPING_UNCERTAIN,
    ]
|
||||
|
||||
|
||||
def test_blank_generated_pdf_extraction_is_nonfatal(tmp_path: Path) -> None:
    """A real one-page blank PDF yields "source_text_missing" and no warnings."""
    blank_pdf = tmp_path / "blank.pdf"

    # Write a single 72x72 blank page to disk with pypdf.
    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(width=72, height=72)
    with blank_pdf.open("wb") as handle:
        pdf_writer.write(handle)

    outcome = check_text_fidelity(blank_pdf, "Markdown text", page_count=1)

    first_page = outcome.pages[0]
    assert first_page.comparison_status == "source_text_missing"
    assert outcome.warnings == ()
|
||||
@@ -0,0 +1,235 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf2md_ui.runner import (
|
||||
CommandSpec,
|
||||
ResolvedCommand,
|
||||
RunningCommand,
|
||||
build_child_environment,
|
||||
build_convert_command,
|
||||
build_doctor_command,
|
||||
build_recheck_command,
|
||||
default_output_dir,
|
||||
resolve_cli_command,
|
||||
terminate_process_tree,
|
||||
)
|
||||
from pdf2md_ui.runner import CliResolutionError
|
||||
|
||||
|
||||
def test_resolves_pdf2md_from_path_before_uv(tmp_path: Path) -> None:
    """When both ``pdf2md`` and ``uv`` are found, the ``pdf2md`` executable wins."""
    executables = {"pdf2md": "pdf2md.exe", "uv": "uv.exe"}

    def fake_which(name):
        return executables.get(name)

    pyproject = tmp_path / "pyproject.toml"
    pyproject.write_text("[project]\nname='x'\n", encoding="utf-8")

    resolved = resolve_cli_command(project_root=tmp_path, which=fake_which)

    expected = ResolvedCommand(("pdf2md.exe",), cwd=None, source="path")
    assert resolved == expected
|
||||
|
||||
|
||||
def test_resolves_uv_run_with_project_root_when_pdf2md_missing(tmp_path: Path) -> None:
    """With only ``uv`` found, the command is ``uv run pdf2md`` run from the project root."""
    executables = {"uv": "uv.exe"}

    def fake_which(name):
        return executables.get(name)

    pyproject = tmp_path / "pyproject.toml"
    pyproject.write_text("[project]\nname='x'\n", encoding="utf-8")

    resolved = resolve_cli_command(project_root=tmp_path, which=fake_which)

    expected = ResolvedCommand(("uv.exe", "run", "pdf2md"), cwd=tmp_path.resolve(), source="uv")
    assert resolved == expected
|
||||
|
||||
|
||||
def test_resolution_requires_project_root_for_uv() -> None:
    """Resolving through ``uv`` without a ``project_root`` raises ``CliResolutionError``."""

    def only_uv(name):
        if name == "uv":
            return "uv.exe"
        return None

    with pytest.raises(CliResolutionError):
        resolve_cli_command(which=only_uv)
|
||||
|
||||
|
||||
def test_configured_command_must_be_pdf2md() -> None:
    """A configured command of ``mineru.exe`` is rejected with an error mentioning "pdf2md"."""
    expected_failure = pytest.raises(CliResolutionError, match="pdf2md")

    with expected_failure:
        resolve_cli_command(configured_command="mineru.exe")
|
||||
|
||||
|
||||
def test_builds_doctor_command() -> None:
    """The doctor command appends ``doctor`` to the resolved argv and keeps its cwd."""
    repo_dir = Path("repo")
    resolved = ResolvedCommand(("uv", "run", "pdf2md"), cwd=repo_dir, source="uv")

    built = build_doctor_command(resolved)

    expected = CommandSpec(("uv", "run", "pdf2md", "doctor"), cwd=repo_dir)
    assert built == expected
|
||||
|
||||
|
||||
def test_builds_convert_command_with_fixed_argument_list(tmp_path: Path) -> None:
    """The convert command is a fixed argv tuple, including for non-ASCII paths.

    The input and output paths use a Korean name so the test covers non-ASCII
    arguments. The expected tuple also ends with ``--mineru-profile auto`` even
    though no profile is passed here.
    """
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")
    # The literal was previously mojibake ("?쇰Ц": UTF-8 bytes decoded as CP949)
    # containing "?", which is not a legal Windows filename character.
    input_pdf = tmp_path / "논문.pdf"
    output_dir = tmp_path / "outputs" / "논문"

    command = build_convert_command(
        resolved,
        input_pdf,
        output_dir,
        overwrite=True,
        keep_raw=True,
        chunk_pages=20,
        gpu="cuda:0",
    )

    assert command.args == (
        "pdf2md",
        "convert",
        str(input_pdf),
        "--out",
        str(output_dir),
        "--overwrite",
        "--keep-raw",
        "--chunk-pages",
        "20",
        "--gpu",
        "cuda:0",
        "--mineru-profile",
        "auto",
    )
|
||||
|
||||
|
||||
def test_builds_recheck_command(tmp_path: Path) -> None:
    """The recheck command is ``pdf2md recheck <markdown path>``."""
    markdown_file = tmp_path / "paper.md"
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")

    built = build_recheck_command(resolved, markdown_file)

    expected_args = ("pdf2md", "recheck", str(markdown_file))
    assert built.args == expected_args
|
||||
|
||||
|
||||
def test_generated_commands_do_not_include_remote_or_api_options(tmp_path: Path) -> None:
    """A default convert command contains none of the remote/API tokens listed below."""
    forbidden_tokens = ("--api-url", "http://", "https://", "router", "openai", "mineru-api")
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")

    command = build_convert_command(resolved, tmp_path / "paper.pdf", tmp_path / "out")

    # Case-folded so the check is case-insensitive.
    command_line = " ".join(command.args).casefold()
    leaked = [token for token in forbidden_tokens if token in command_line]
    assert leaked == []
|
||||
|
||||
|
||||
def test_default_output_dir_uses_shared_output_root(tmp_path: Path) -> None:
    """The default output directory is ``<base_dir>/outputs``, not a per-PDF folder."""
    # The literal was previously mojibake ("?섍뎄議곕Ъ": UTF-8 bytes decoded as
    # CP949). The name is arbitrary for this assertion; a valid Korean name
    # keeps the non-ASCII coverage.
    pdf = tmp_path / "쉘구조물.pdf"

    assert default_output_dir(pdf, base_dir=tmp_path) == tmp_path / "outputs"
|
||||
|
||||
|
||||
def test_convert_rejects_non_positive_chunk_pages(tmp_path: Path) -> None:
    """``chunk_pages=0`` raises ``ValueError`` with a message matching "positive"."""
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")
    expected_failure = pytest.raises(ValueError, match="positive")

    with expected_failure:
        build_convert_command(
            resolved,
            tmp_path / "paper.pdf",
            tmp_path / "out",
            chunk_pages=0,
        )
|
||||
|
||||
|
||||
def test_convert_rejects_prohibited_gpu_tokens(tmp_path: Path) -> None:
    """A URL passed as ``gpu`` raises ``ValueError`` with a message matching "strict-local"."""
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")
    expected_failure = pytest.raises(ValueError, match="strict-local")

    with expected_failure:
        build_convert_command(
            resolved,
            tmp_path / "paper.pdf",
            tmp_path / "out",
            gpu="https://example.test",
        )
|
||||
|
||||
|
||||
def test_convert_rejects_unknown_mineru_profile(tmp_path: Path) -> None:
    """The profile name "fast" raises ``ValueError`` with a message matching "mineru_profile"."""
    resolved = ResolvedCommand(("pdf2md",), cwd=None, source="path")
    expected_failure = pytest.raises(ValueError, match="mineru_profile")

    with expected_failure:
        build_convert_command(
            resolved,
            tmp_path / "paper.pdf",
            tmp_path / "out",
            mineru_profile="fast",
        )
|
||||
|
||||
|
||||
def test_child_environment_defaults_mineru_model_source() -> None:
    """``MINERU_MODEL_SOURCE`` is set to "local" when the base environment lacks it."""
    base_environment = {"PATH": "x"}

    child_environment = build_child_environment(base_environment)

    assert child_environment["MINERU_MODEL_SOURCE"] == "local"
|
||||
|
||||
|
||||
def test_child_environment_preserves_existing_mineru_model_source() -> None:
    """A ``MINERU_MODEL_SOURCE`` already present in the base environment is kept."""
    base_environment = {"MINERU_MODEL_SOURCE": "custom"}

    child_environment = build_child_environment(base_environment)

    assert child_environment["MINERU_MODEL_SOURCE"] == "custom"
|
||||
|
||||
|
||||
def test_running_command_uses_shell_false_and_streams_output() -> None:
    """``RunningCommand.run`` spawns without a shell and emits start/output/exit events.

    A fake ``Popen`` factory records its call and returns a process whose
    stdout yields two lines and whose exit code is 0.
    """
    recorded: dict[str, object] = {}
    events = []

    class StubProcess:
        pid = 123
        stdout = iter(["hello\n", "done\n"])

        def wait(self, timeout=None):
            return 0

        def poll(self):
            return 0

    def fake_popen(*args, **kwargs):
        recorded["args"] = args
        recorded["kwargs"] = kwargs
        return StubProcess()

    spec = CommandSpec(("pdf2md", "doctor"))
    runner = RunningCommand(spec, events.append, popen_factory=fake_popen, base_env={})

    exit_code = runner.run()

    assert exit_code == 0
    # The argv tuple is passed as the single positional argument.
    assert recorded["args"] == (("pdf2md", "doctor"),)
    popen_kwargs = recorded["kwargs"]
    assert popen_kwargs["shell"] is False
    assert popen_kwargs["stderr"] is subprocess.STDOUT
    assert popen_kwargs["env"]["MINERU_MODEL_SOURCE"] == "local"

    # Output lines arrive without their trailing newline.
    observed = [(event.kind, event.message, event.exit_code) for event in events]
    assert observed == [
        ("start", "pdf2md doctor", None),
        ("output", "hello", None),
        ("output", "done", None),
        ("exit", "Command exited with code 0.", 0),
    ]
|
||||
|
||||
|
||||
def test_cancel_uses_taskkill_after_windows_grace_timeout() -> None:
    """On "nt", a process that outlives the grace wait is killed with ``taskkill /t /f``.

    The fake process times out on its first ``wait`` and returns 1 afterwards.
    """
    taskkill_calls = []

    class SlowProcess:
        pid = 456

        def __init__(self) -> None:
            self.wait_count = 0
            self.terminated = False

        def poll(self):
            return None

        def terminate(self) -> None:
            self.terminated = True

        def wait(self, timeout=None):
            self.wait_count += 1
            if self.wait_count != 1:
                return 1
            # Only the first wait times out.
            raise subprocess.TimeoutExpired("pdf2md", timeout)

    def fake_taskkill(*args, **kwargs):
        taskkill_calls.append((args, kwargs))
        return subprocess.CompletedProcess(args[0], 0)

    slow = SlowProcess()

    stopped = terminate_process_tree(
        slow,
        grace_seconds=0,
        taskkill_runner=fake_taskkill,
        os_name="nt",
    )

    assert stopped
    assert slow.terminated
    first_args, _first_kwargs = taskkill_calls[0]
    assert first_args[0] == ["taskkill", "/pid", "456", "/t", "/f"]
|
||||
|
||||
|
||||
def test_cancel_does_not_taskkill_when_process_exits_promptly() -> None:
    """A process whose ``wait`` returns at once is stopped without calling taskkill.

    ``os_name="nt"` is pinned, as in the taskkill test for a slow process, so
    the assertion does not depend on the platform running the suite.
    """
    taskkill_calls = []

    class FastProcess:
        pid = 789

        def poll(self):
            return None

        def terminate(self) -> None:
            pass

        def wait(self, timeout=None):
            return 0

    def fake_taskkill(*args, **kwargs):
        # Must never run; any call is recorded so the final assertion fails.
        taskkill_calls.append(args)

    stopped = terminate_process_tree(FastProcess(), taskkill_runner=fake_taskkill, os_name="nt")

    assert stopped
    assert taskkill_calls == []
|
||||
Reference in New Issue
Block a user