modify pdftomd
This commit is contained in:
+112
-16
@@ -2,6 +2,8 @@ from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from importlib.metadata import entry_points
|
||||
from pathlib import Path
|
||||
@@ -9,8 +11,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
from pypdf import PdfWriter
|
||||
|
||||
import pdf2md.conversion as conversion_module
|
||||
from pdf2md.cli import main
|
||||
from pdf2md.doctor import DoctorCheck, DoctorReport
|
||||
from pdf2md.gpu import GpuInfo
|
||||
from pdf2md.ir import WarningCode, WarningRecord, WarningSeverity
|
||||
from pdf2md.mineru_adapter import MinerUAdapterResult
|
||||
|
||||
@@ -69,6 +73,24 @@ def make_pdf_with_pages(directory: Path, name: str, page_count: int) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def write_legacy_metadata(markdown_path: Path, source_pdf: Path) -> Path:
    """Write a legacy-style ``.metadata.json`` sidecar for a Markdown file.

    The sidecar path is ``markdown_path`` with its suffix replaced by
    ``.metadata.json``. The payload records the resolved PDF path and the
    SHA-256 of the PDF bytes; every other field is a fixed fixture value.

    Args:
        markdown_path: Markdown file the sidecar is derived from.
        source_pdf: PDF whose bytes are hashed; it must be readable.

    Returns:
        The path of the metadata file that was written.
    """
    metadata_path = markdown_path.with_suffix(".metadata.json")
    metadata = {
        "source_pdf": str(source_pdf.resolve()),
        "source_sha256": hashlib.sha256(source_pdf.read_bytes()).hexdigest(),
        "created_at": "2026-05-08T00:00:00Z",
        "engine": "MinerU",
        "engine_version": "3.1.0",
        "engine_options": {"strict_local": True},
        "pages": [{"page_index": 0, "blocks": []}],
        "assets": [],
        "warnings": [],
        "summary": {"pages_processed": 1, "warning_count": 0},
    }
    # sort_keys plus a trailing newline keep the written file byte-stable.
    metadata_path.write_text(json.dumps(metadata, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return metadata_path
|
||||
|
||||
|
||||
def test_console_script_entry_point_is_reserved() -> None:
|
||||
scripts = {entry_point.name: entry_point for entry_point in entry_points(group="console_scripts")}
|
||||
|
||||
@@ -128,18 +150,91 @@ def test_cli_convert_single_pdf_writes_outputs_and_summary(tmp_path: Path, capsy
|
||||
out = tmp_path / "out"
|
||||
adapter = FakeAdapter()
|
||||
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out), "--metadata"], adapter=adapter, clock=fixed_clock)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "converted: 1" in captured.out
|
||||
assert "failed: 0" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert (out / "paper.md").exists()
|
||||
assert (out / "paper.metadata.json").exists()
|
||||
assert (out / "paper.report.md").exists()
|
||||
assert (out / "paper" / "paper_001.md").exists()
|
||||
assert not list(out.rglob("*.metadata.json"))
|
||||
assert (out / "paper" / "paper_report.md").exists()
|
||||
assert adapter.calls == [pdf.resolve()]
|
||||
assert adapter.options[0].to_engine_options() == {"strict_local": True, "gpu_device": "cuda:0"}
|
||||
assert adapter.options[0].to_engine_options() == {
|
||||
"strict_local": True,
|
||||
"gpu_device": "cuda:0",
|
||||
"mineru_profile": {
|
||||
"requested": "auto",
|
||||
"applied": "safe",
|
||||
"environment": {
|
||||
"MINERU_API_MAX_CONCURRENT_REQUESTS": "1",
|
||||
"MINERU_PDF_RENDER_THREADS": "1",
|
||||
"MINERU_PROCESSING_WINDOW_SIZE": "1",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_cli_convert_accepts_safe_mineru_profile(tmp_path: Path, capsys) -> None:
    """``convert --mineru-profile safe`` exits 0 and records "safe" as requested."""
    pdf = make_pdf(tmp_path, "paper.pdf")
    adapter = FakeAdapter()

    exit_code = main(
        ["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "safe"],
        adapter=adapter,
        clock=fixed_clock,
    )

    # Drain captured output; this test does not inspect it.
    capsys.readouterr()
    assert exit_code == 0
    assert adapter.options[0].to_engine_options()["mineru_profile"]["requested"] == "safe"
|
||||
|
||||
|
||||
def test_cli_convert_accepts_performance_mineru_profile(tmp_path: Path, capsys) -> None:
    """``convert --mineru-profile performance`` exits 0 and records the request.

    The adapter options are expected to report ``requested == "performance"``
    while ``applied`` is ``"safe"`` in this test setup.
    """
    pdf = make_pdf(tmp_path, "paper.pdf")
    adapter = FakeAdapter()

    exit_code = main(
        ["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "performance"],
        adapter=adapter,
        clock=fixed_clock,
    )

    # Drain captured output; this test does not inspect it.
    capsys.readouterr()
    assert exit_code == 0
    profile = adapter.options[0].to_engine_options()["mineru_profile"]
    assert profile["requested"] == "performance"
    assert profile["applied"] == "safe"
|
||||
|
||||
|
||||
def test_cli_convert_rejects_invalid_mineru_profile(tmp_path: Path, capsys) -> None:
    """An unknown ``--mineru-profile`` value exits with code 2 and an error on stderr."""
    pdf = make_pdf(tmp_path, "paper.pdf")

    # "fast" is the rejected value; main() is expected to raise SystemExit.
    with pytest.raises(SystemExit) as error:
        main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--mineru-profile", "fast"])

    captured = capsys.readouterr()
    assert error.value.code == 2
    assert "invalid choice" in captured.err
|
||||
|
||||
|
||||
def test_cli_convert_gpu_auto_selects_largest_visible_gpu(tmp_path: Path, capsys, monkeypatch) -> None:
    """``convert --gpu auto`` picks the GPU with the most memory from the inventory.

    ``conversion_module.query_nvidia_gpus`` is patched to return two GPUs.
    The device at index 1 has the larger ``memory_total_mib``, so the engine
    options are expected to name ``cuda:1`` and that GPU.
    """
    pdf = make_pdf(tmp_path, "paper.pdf")
    adapter = FakeAdapter()
    inventory = (
        GpuInfo(index=0, name="NVIDIA RTX 4060", memory_total_mib=8192, driver_version="577.00"),
        GpuInfo(index=1, name="NVIDIA RTX 4090", memory_total_mib=24564, driver_version="577.00"),
    )
    # Replace the GPU query with the fixed inventory above.
    monkeypatch.setattr(conversion_module, "query_nvidia_gpus", lambda: inventory)

    exit_code = main(["convert", str(pdf), "--out", str(tmp_path / "out"), "--gpu", "auto"], adapter=adapter, clock=fixed_clock)

    # Drain captured output; this test does not inspect it.
    capsys.readouterr()
    options = adapter.options[0].to_engine_options()
    assert exit_code == 0
    assert options["gpu_device"] == "cuda:1"
    assert options["mineru_profile"]["selected_gpu"]["name"] == "NVIDIA RTX 4090"
|
||||
|
||||
|
||||
def test_cli_convert_directory_is_deterministic(tmp_path: Path, capsys) -> None:
|
||||
@@ -173,7 +268,7 @@ def test_cli_convert_recursive_only_when_requested(tmp_path: Path, capsys) -> No
|
||||
assert exit_code == 0
|
||||
assert [path.name for path in adapter.calls] == ["child.pdf", "top.pdf"]
|
||||
assert "converted: 2" in captured.out
|
||||
assert (tmp_path / "out" / "nested" / "child.md").exists()
|
||||
assert (tmp_path / "out" / "nested" / "child" / "child_001.md").exists()
|
||||
|
||||
|
||||
def test_cli_failure_summary_returns_nonzero(tmp_path: Path, capsys) -> None:
|
||||
@@ -204,22 +299,23 @@ def test_cli_recheck_markdown_regenerates_adjacent_metadata_and_report(tmp_path:
|
||||
)
|
||||
capsys.readouterr()
|
||||
|
||||
markdown_path = out / "paper.md"
|
||||
markdown_path = out / "paper" / "paper_001.md"
|
||||
markdown_path.write_text("Inline $x_i$\n", encoding="utf-8")
|
||||
write_legacy_metadata(markdown_path, pdf)
|
||||
exit_code = main(["recheck", str(markdown_path)], clock=fixed_clock, math_checker=lambda _: True)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "rechecked:" in captured.out
|
||||
assert "warnings: 0" in captured.out
|
||||
assert "- Final status: `success`" in (out / "paper.report.md").read_text(encoding="utf-8")
|
||||
assert "- Final status: `success`" in markdown_path.with_suffix(".report.md").read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_cli_preflight_conflict_fails_before_conversion(tmp_path: Path, capsys) -> None:
|
||||
pdf = make_pdf(tmp_path, "paper.pdf")
|
||||
out = tmp_path / "out"
|
||||
out.mkdir()
|
||||
(out / "paper.md").write_text("old", encoding="utf-8")
|
||||
(out / "paper").mkdir(parents=True)
|
||||
(out / "paper" / "paper_001.md").write_text("old", encoding="utf-8")
|
||||
adapter = FakeAdapter()
|
||||
|
||||
exit_code = main(["convert", str(pdf), "--out", str(out)], adapter=adapter, clock=fixed_clock)
|
||||
@@ -240,12 +336,12 @@ def test_cli_convert_chunk_pages_flag_uses_default_twenty_pages(tmp_path: Path,
|
||||
captured = capsys.readouterr()
|
||||
assert exit_code == 0
|
||||
assert "converted: 2" in captured.out
|
||||
assert [path.name for path in adapter.calls] == [
|
||||
"long.part-001.pages-001-020.pdf",
|
||||
"long.part-002.pages-021-021.pdf",
|
||||
]
|
||||
assert (out / "long.part-001.pages-001-020.md").exists()
|
||||
assert (out / "long.part-002.pages-021-021.md").exists()
|
||||
assert len(adapter.calls) == 21
|
||||
assert [path.name for path in adapter.calls[:3]] == ["long.page-001.pdf", "long.page-002.pdf", "long.page-003.pdf"]
|
||||
assert (out / "long" / "long_001.md").exists()
|
||||
assert (out / "long" / "long_002.md").exists()
|
||||
assert (out / "long" / "long_report.md").exists()
|
||||
assert not list(out.rglob("*.metadata.json"))
|
||||
|
||||
|
||||
def test_cli_convert_rejects_non_positive_chunk_pages(tmp_path: Path, capsys) -> None:
|
||||
|
||||
Reference in New Issue
Block a user