111 lines
3.6 KiB
Python
111 lines
3.6 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
# Opt-in gate: the whole module is skipped at import time unless the
# PDF2MD_RUN_MINERU_FIXTURES environment variable is exactly "1".
if os.environ.get("PDF2MD_RUN_MINERU_FIXTURES", "") != "1":
    pytest.skip(
        "optional local MinerU fixture evaluation is disabled; "
        "set PDF2MD_RUN_MINERU_FIXTURES=1 to run",
        allow_module_level=True,
    )
|
|
|
|
|
|
# Repository root: the third ancestor directory of this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory that is globbed for the sample PDFs converted by the fixture test.
SAMPLES_DIR = REPO_ROOT.joinpath("samples")
|
|
|
|
|
|
def test_optional_local_mineru_samples_produce_release_outputs(tmp_path: Path) -> None:
    """Convert every local sample PDF with the pdf2md CLI and check its outputs.

    The test skips when ``python -m pdf2md.cli doctor`` exits non-zero or when
    ``SAMPLES_DIR`` contains no ``*.pdf`` files. Otherwise each PDF is converted
    into a directory under ``tmp_path``; the test then asserts that the expected
    Markdown and report files exist, that no ``*.metadata.json`` file was
    produced, and that the report contains the expected summary lines. A JSON
    record of all attempts is written once every sample has passed.
    """
    # Preflight: let the CLI's own "doctor" command decide whether the local
    # environment can run the conversion at all.
    doctor_run = subprocess.run(
        [sys.executable, "-m", "pdf2md.cli", "doctor"],
        cwd=REPO_ROOT,
        check=False,
        capture_output=True,
        text=True,
    )
    if doctor_run.returncode != 0:
        pytest.skip(f"local MinerU fixture evaluation blocked by doctor:\n{doctor_run.stdout}\n{doctor_run.stderr}")

    # Case-insensitive name ordering keeps the processing order stable.
    sample_pdfs = tuple(sorted(SAMPLES_DIR.glob("*.pdf"), key=lambda candidate: candidate.name.casefold()))
    if not sample_pdfs:
        pytest.skip(f"no local sample PDFs found under {SAMPLES_DIR}")

    output_root = tmp_path / "mineru-fixture-output"
    attempts: list[dict[str, object]] = []
    for pdf in sample_pdfs:
        # Build the command once so the executed argv and the recorded command
        # string cannot drift apart.
        command = [
            sys.executable,
            "-m",
            "pdf2md.cli",
            "convert",
            str(pdf),
            "--out",
            str(output_root),
        ]
        completed = subprocess.run(
            command,
            cwd=REPO_ROOT,
            check=False,
            capture_output=True,
            text=True,
            timeout=1800,
        )
        attempt: dict[str, object] = {
            "source": str(pdf.relative_to(REPO_ROOT)),
            "command": " ".join(command),
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        attempts.append(attempt)
        # On failure, surface the full attempt record as the assertion message.
        assert completed.returncode == 0, json.dumps(attempt, ensure_ascii=False, indent=2)

        sample_output = output_root / pdf.stem
        markdown_path = sample_output / f"{pdf.stem}_001.md"
        report_path = sample_output / f"{pdf.stem}_report.md"
        assert markdown_path.exists()
        assert report_path.exists()
        # No metadata JSON sidecar may be present in the sample's output directory.
        assert not any(sample_output.glob("*.metadata.json"))

        report = report_path.read_text(encoding="utf-8")
        assert "Output Markdown:" in report
        assert "Metadata JSON:" not in report
        # Remaining summary lines the report must contain, checked in order.
        for required_text in (
            "Report Markdown:",
            "- Engine: MinerU",
            "- Pages processed:",
            "- Warning count:",
            "- Math render error count:",
            "- Asset count:",
        ):
            assert required_text in report

        # ``attempt`` is the same dict object already stored in ``attempts``.
        attempt.update(
            {
                "markdown_path": str(markdown_path),
                "report_path": str(report_path),
                "final_status": _report_final_status(report),
            }
        )

    record_path = output_root / "fixture-evaluation.json"
    record_text = json.dumps({"attempts": attempts}, ensure_ascii=False, indent=2) + "\n"
    record_path.write_text(record_text, encoding="utf-8")
    assert record_path.exists()
|
|
|
|
|
|
def _report_final_status(report: str) -> str:
|
|
match = re.search(r"^- Final status: `(?P<status>[^`]+)`$", report, re.MULTILINE)
|
|
return match.group("status") if match else "unavailable"
|